diff --git a/.cirrus.yml b/.cirrus.yml new file mode 100644 index 000000000..9a898f421 --- /dev/null +++ b/.cirrus.yml @@ -0,0 +1,177 @@ +macos_instance: + image: ghcr.io/cirruslabs/macos-monterey-xcode:latest + +#task: +# name: AppleM1/LLVM +# compile_script: +# - brew install llvm +# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH +# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" +# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" +# - make TARGET=VORTEX USE_OPENMP=1 CC=clang + +#task: +# name: AppleM1/LLVM/ILP64 +# compile_script: +# - brew install llvm +# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH +# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" +# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" +# - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1 + +#task: +# name: AppleM1/LLVM/CMAKE +# compile_script: +# - brew install llvm +# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH +# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" +# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" +# - mkdir build +# - cd build +# - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON .. +# - make -j 4 + +#task: +# name: AppleM1/GCC/MAKE/OPENMP +# compile_script: +# - brew install gcc@11 +# - export PATH=/opt/homebrew/bin:$PATH +# - export LDFLAGS="-L/opt/homebrew/lib" +# - export CPPFLAGS="-I/opt/homebrew/include" +# - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1 + +macos_instance: + image: ghcr.io/cirruslabs/macos-monterey-xcode:latest +task: + name: AppleM1/LLVM x86_64 xbuild + compile_script: + - #brew install llvm + - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH + - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" + - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" + - export ARCHS="i386 x86_64" + - export ARCHS_STANDARD="i386 x86_64" + - export ARCHS_STANDARD_32_64_BIT="i386 x86_64" + - export ARCHS_STANDARD_64_BIT=x86_64 + - export ARCHS_STANDARD_INCLUDING_64_BIT="i386 x86_64" + - export ARCHS_UNIVERSAL_IPHONE_OS="i386 x86_64" + - export VALID_ARCHS="i386 x86_64" + - xcrun --sdk macosx --show-sdk-path + - xcodebuild -version + - export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX12.3.sdk -arch x86_64" + - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" + always: + config_artifacts: + path: "*conf*" + type: text/plain +# lib_artifacts: +# path: "libopenblas*" +# type: application/octet-streamm + +macos_instance: + image: ghcr.io/cirruslabs/macos-monterey-xcode:latest +task: + name: AppleM1/LLVM armv8-ios xbuild + compile_script: + - #brew install llvm + - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH + - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" + - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" + - export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0" + - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1 + always: + config_artifacts: + path: "*conf*" + type: text/plain + +macos_instance: + image: ghcr.io/cirruslabs/macos-monterey-xcode:latest +task: + name: AppleM1/LLVM armv7-androidndk xbuild + compile_script: + - #brew install android-ndk + - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH + - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" + - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" + - find /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b -name "armv7a-linux-androideabi*-ranlib" + - #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + - #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0" + - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b/AndroidNDK8937393.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang + - make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" + always: + config_artifacts: + path: "*conf*" + type: text/plain + +task: + name: NeoverseN1 + arm_container: + image: node:latest + compile_script: + - make + + task: + name: NeoverseN1-ILP64 + arm_container: + image: node:latest + compile_script: + - make INTERFACE64=1 + +task: + name: NeoverseN1-OMP + arm_container: + image: node:latest + cpu: 8 + compile_script: + - make USE_OPENMP=1 + +FreeBSD_task: + name: FreeBSD-gcc12 + freebsd_instance: + image_family: freebsd-13-2 + install_script: + - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc + compile_script: + - ls -l /usr/local/lib + - gmake CC=gcc + + +FreeBSD_task: + name: freebsd-gcc12-ilp64 + freebsd_instance: + image_family: freebsd-13-2 + install_script: + - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc + compile_script: + - ls -l /usr/local/lib + - gmake CC=gcc INTERFACE64=1 + +FreeBSD_task: + name: FreeBSD-clang-openmp + freebsd_instance: + image_family: freebsd-13-2 + install_script: + - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc + - ln -s /usr/local/lib/gcc12/libgfortran.so.5.0.0 /usr/lib/libgfortran.so + compile_script: + - gmake CC=clang FC=gfortran USE_OPENMP=1 CPP_THREAD_SAFETY_TEST=1 + +#task: +# name: Windows/LLVM16 --- too slow --- +# windows_container: +# image: cirrusci/windowsservercore:cmake-2021.12.07 +# install_script: +# - choco list --localonly +# - choco install -y llvm +# - # choco install -y cmake --installargs '"ADD_CMAKE_TO_PATH=System"' +# - choco install -y ninja +# - refreshenv +# - cd "c:/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools/VC/Auxiliary/Build" +# - vcvarsall x64 +# - cd "C:\Users\ContainerAdministrator\AppData\Local\Temp\cirrus-ci-build" +# - cmake -S . -B build -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release +# - cd build +# - cmake --build . +# - ctest diff --git a/.cirun.yml b/.cirun.yml new file mode 100644 index 000000000..bfc6494d0 --- /dev/null +++ b/.cirun.yml @@ -0,0 +1,16 @@ +# Self-Hosted Github Action Runners on AWS via Cirun.io +# Reference: https://docs.cirun.io/reference/yaml +runners: + - name: "aws-runner-graviton" + # Cloud Provider: AWS + cloud: "aws" + region: "us-east-1" + # Cheapest VM on AWS + instance_type: "c7g.large" + # Ubuntu-22.04, ami image + machine_image: "ami-0a0c8eebcdd6dcbd0" + preemptible: false + # Add this label in the "runs-on" param in .github/workflows/.yml + # So that this runner is created for running the workflow + labels: + - "cirun-aws-runner-graviton" diff --git a/.github/workflows/apple_m.yml b/.github/workflows/apple_m.yml new file mode 100644 index 000000000..e34eada86 --- /dev/null +++ b/.github/workflows/apple_m.yml @@ -0,0 +1,149 @@ +name: apple m + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read # to fetch code (actions/checkout) + +jobs: + build: + if: "github.repository == 'OpenMathLib/OpenBLAS'" + runs-on: macos-14 + + strategy: + fail-fast: false + matrix: + build: [cmake, make] + fortran: [gfortran] + openmp: [0, 1] + ilp64: [0, 1] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Print system information + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + cat /proc/cpuinfo + elif [ "$RUNNER_OS" == "macOS" ]; then + sysctl -a | grep machdep.cpu + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + + - name: Install Dependencies + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + sudo apt-get install -y gfortran cmake ccache libtinfo5 + elif [ "$RUNNER_OS" == "macOS" ]; then + # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. + brew reinstall gcc + brew install coreutils cmake ccache + brew install llvm + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + # We include the commit sha in the cache key, as new cache entries are + # only created if there is no existing entry for the key yet. + # GNU make and cmake call the compilers differently. It looks like + # that causes the cache to mismatch. Keep the ccache for both build + # tools separate to avoid polluting each other. + key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }} + # Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler. + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }} + ccache-${{ runner.os }}-${{ matrix.build }} + + - name: Configure ccache + run: | + if [ "${{ matrix.build }}" = "make" ]; then + # Add ccache to path + if [ "$RUNNER_OS" = "Linux" ]; then + echo "/usr/lib/ccache" >> $GITHUB_PATH + elif [ "$RUNNER_OS" = "macOS" ]; then + echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH + echo "/opt/homebrew/opt/llvm/bin" >>$GITHUB_PATH + echo "" >>$GITHUB_PATH + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + fi + # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB). + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: Build OpenBLAS + run: | + export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" + export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" + export CC="/opt/homebrew/opt/llvm/bin/clang" + case "${{ matrix.build }}" in + "make") + make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=${{matrix.openmp}} INTERFACE64=${{matrix.ilp64}} FC="ccache ${{ matrix.fortran }}" + ;; + "cmake") + export LDFLAGS="$LDFLAGS -Wl,-ld_classic" + mkdir build && cd build + cmake -DDYNAMIC_ARCH=1 \ + -DUSE_OPENMP=${{matrix.openmp}} \ + -DINTERFACE64=${{matrix.ilp64}} \ + -DNOFORTRAN=0 \ + -DBUILD_WITHOUT_LAPACK=0 \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ + .. + cmake --build . + ;; + *) + echo "::error::Configuration not supported" + exit 1 + ;; + esac + + - name: Show ccache status + continue-on-error: true + run: ccache -s + + - name: Run tests + timeout-minutes: 60 + run: | + case "${{ matrix.build }}" in + "make") + MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0' + echo "::group::Tests in 'test' directory" + make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" + echo "::endgroup::" + echo "::group::Tests in 'ctest' directory" + make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" + echo "::endgroup::" + echo "::group::Tests in 'utest' directory" + make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" + echo "::endgroup::" + ;; + "cmake") + cd build && ctest + ;; + *) + echo "::error::Configuration not supported" + exit 1 + ;; + esac diff --git a/.github/workflows/arm64_graviton.yml b/.github/workflows/arm64_graviton.yml new file mode 100644 index 000000000..6928312b5 --- /dev/null +++ b/.github/workflows/arm64_graviton.yml @@ -0,0 +1,139 @@ +name: arm64 graviton cirun + +on: + push: + branches: + - develop + - release-** + pull_request: + branches: + - develop + - release-** + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read # to fetch code (actions/checkout) + +jobs: + build: + if: "github.repository == 'OpenMathLib/OpenBLAS'" + runs-on: "cirun-aws-runner-graviton--${{ github.run_id }}" + + strategy: + fail-fast: false + matrix: + fortran: [gfortran] + build: [cmake, make] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Print system information + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + cat /proc/cpuinfo + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + + - name: Install Dependencies + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + sudo apt update + sudo apt-get install -y gfortran cmake ccache libtinfo5 + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + # We include the commit sha in the cache key, as new cache entries are + # only created if there is no existing entry for the key yet. + # GNU make and cmake call the compilers differently. It looks like + # that causes the cache to mismatch. Keep the ccache for both build + # tools separate to avoid polluting each other. + key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }} + # Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler. + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }} + ccache-${{ runner.os }}-${{ matrix.build }} + + - name: Configure ccache + run: | + if [ "${{ matrix.build }}" = "make" ]; then + # Add ccache to path + if [ "$RUNNER_OS" = "Linux" ]; then + echo "/usr/lib/ccache" >> $GITHUB_PATH + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + fi + # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB). + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: Build OpenBLAS + run: | + case "${{ matrix.build }}" in + "make") + make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}" + ;; + "cmake") + mkdir build && cd build + cmake -DDYNAMIC_ARCH=1 \ + -DNOFORTRAN=0 \ + -DBUILD_WITHOUT_LAPACK=0 \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ + .. + cmake --build . + ;; + *) + echo "::error::Configuration not supported" + exit 1 + ;; + esac + + - name: Show ccache status + continue-on-error: true + run: ccache -s + + - name: Run tests + timeout-minutes: 60 + run: | + case "${{ matrix.build }}" in + "make") + MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0' + echo "::group::Tests in 'test' directory" + make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" + echo "::endgroup::" + echo "::group::Tests in 'ctest' directory" + make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" + echo "::endgroup::" + echo "::group::Tests in 'utest' directory" + make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" + echo "::endgroup::" + ;; + "cmake") + cd build && ctest + ;; + *) + echo "::error::Configuration not supported" + exit 1 + ;; + esac diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml new file mode 100644 index 000000000..68ba2ddd7 --- /dev/null +++ b/.github/workflows/c910v.yml @@ -0,0 +1,126 @@ +name: c910v qemu test + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read # to fetch code (actions/checkout) + +jobs: + TEST: + if: "github.repository == 'OpenMathLib/OpenBLAS'" + runs-on: ubuntu-latest + env: + xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1698113812618 + toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0-20231018.tar.gz + strategy: + fail-fast: false + matrix: + include: + - target: RISCV64_GENERIC + triple: riscv64-linux-gnu + apt_triple: riscv64-linux-gnu + opts: NO_SHARED=1 TARGET=RISCV64_GENERIC + - target: C910V + triple: riscv64-unknown-linux-gnu + apt_triple: riscv64-linux-gnu + opts: NO_SHARED=1 TARGET=C910V + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: install build deps + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ + gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross + + - name: checkout qemu + uses: actions/checkout@v3 + with: + repository: T-head-Semi/qemu + path: qemu + ref: 1e692ebb43d396c52352406323fc782c1ac99a42 + + - name: build qemu + run: | + # Force use c910v qemu-user + wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch + cd qemu + patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system + make -j$(nproc) + make install + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.target }} + + - name: Configure ccache + run: | + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: build OpenBLAS + run: | + wget ${xuetie_toolchain}/${toolchain_file_name} + tar -xvf ${toolchain_file_name} -C /opt + export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0/bin:$PATH" + + make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) + + - name: test + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH + qemu-riscv64 ./utest/openblas_utest + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat1 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat2 < ./ctest/sin2 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat2 < ./ctest/din2 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat2 < ./ctest/cin2 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat2 < ./ctest/zin2 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat3 < ./ctest/sin3 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat3 < ./ctest/din3 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat3 < ./ctest/cin3 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat3 < ./ctest/zin3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat1 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat1 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat1 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat1 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat1 + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index ff40b354d..669aa8116 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -2,11 +2,16 @@ name: continuous build on: [push, pull_request] +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + permissions: contents: read # to fetch code (actions/checkout) jobs: build: + if: "github.repository == 'OpenMathLib/OpenBLAS'" runs-on: ${{ matrix.os }} strategy: @@ -37,6 +42,7 @@ jobs: - name: Install Dependencies run: | if [ "$RUNNER_OS" == "Linux" ]; then + sudo apt-get update sudo apt-get install -y gfortran cmake ccache libtinfo5 elif [ "$RUNNER_OS" == "macOS" ]; then # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. @@ -146,45 +152,59 @@ jobs: msys2: + if: "github.repository == 'OpenMathLib/OpenBLAS'" runs-on: windows-latest strategy: fail-fast: false matrix: - msystem: [MINGW64, MINGW32, CLANG64] + msystem: [UCRT64, MINGW32, CLANG64, CLANG32] idx: [int32, int64] build-type: [Release] include: - - msystem: MINGW64 + - msystem: UCRT64 idx: int32 - target-prefix: mingw-w64-x86_64 - fc-pkg: mingw-w64-x86_64-gcc-fortran + target-prefix: mingw-w64-ucrt-x86_64 + fc-pkg: fc - msystem: MINGW32 idx: int32 target-prefix: mingw-w64-i686 - fc-pkg: mingw-w64-i686-gcc-fortran + fc-pkg: fc - msystem: CLANG64 idx: int32 target-prefix: mingw-w64-clang-x86_64 + fc-pkg: fc + # Compiling with Flang 16 seems to cause test errors on machines + # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. + no-avx512-flags: -DNO_AVX512=1 + - msystem: CLANG32 + idx: int32 + target-prefix: mingw-w64-clang-i686 + fc-pkg: cc c-lapack-flags: -DC_LAPACK=ON - - msystem: MINGW64 + - msystem: UCRT64 idx: int64 idx64-flags: -DBINARY=64 -DINTERFACE64=1 - target-prefix: mingw-w64-x86_64 - fc-pkg: mingw-w64-x86_64-gcc-fortran + target-prefix: mingw-w64-ucrt-x86_64 + fc-pkg: fc - msystem: CLANG64 idx: int64 idx64-flags: -DBINARY=64 -DINTERFACE64=1 target-prefix: mingw-w64-clang-x86_64 - c-lapack-flags: -DC_LAPACK=ON - - msystem: MINGW64 + fc-pkg: fc + # Compiling with Flang 16 seems to cause test errors on machines + # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. + no-avx512-flags: -DNO_AVX512=1 + - msystem: UCRT64 idx: int32 - target-prefix: mingw-w64-x86_64 - fc-pkg: mingw-w64-x86_64-gcc-fortran + target-prefix: mingw-w64-ucrt-x86_64 + fc-pkg: fc build-type: None exclude: - msystem: MINGW32 idx: int64 + - msystem: CLANG32 + idx: int64 defaults: run: @@ -209,7 +229,7 @@ jobs: install: >- base-devel ${{ matrix.target-prefix }}-cc - ${{ matrix.fc-pkg }} + ${{ matrix.target-prefix }}-${{ matrix.fc-pkg }} ${{ matrix.target-prefix }}-cmake ${{ matrix.target-prefix }}-ninja ${{ matrix.target-prefix }}-ccache @@ -217,14 +237,21 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - - name: Compilation cache - uses: actions/cache@v3 - with: - # It looks like this path needs to be hard-coded. - path: C:/msys64/home/runneradmin/.ccache + - name: Prepare ccache + # Get cache location of ccache + # Create key that is used in action/cache/restore and action/cache/save steps + id: ccache-prepare + run: | + echo "ccachedir=$(cygpath -m $(ccache -k cache_dir))" >> $GITHUB_OUTPUT # We include the commit sha in the cache key, as new cache entries are # only created if there is no existing entry for the key yet. - key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }} + echo "key=ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}" >> $GITHUB_OUTPUT + + - name: Restore ccache + uses: actions/cache/restore@v3 + with: + path: ${{ steps.ccache-prepare.outputs.ccachedir }} + key: ${{ steps.ccache-prepare.outputs.key }} # Restore a matching ccache cache entry. Prefer same branch. restore-keys: | ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }} @@ -234,9 +261,10 @@ jobs: # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota. run: | which ccache - test -d ~/.ccache || mkdir -p ~/.ccache - echo "max_size = 250M" > ~/.ccache/ccache.conf - echo "compression = true" >> ~/.ccache/ccache.conf + test -d ${{ steps.ccache-prepare.outputs.ccachedir }} || mkdir -p ${{ steps.ccache-prepare.outputs.ccachedir }} + echo "max_size = 250M" > ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf + echo "compression = true" >> ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf + ccache -p ccache -s echo $HOME cygpath -w $HOME @@ -253,6 +281,7 @@ jobs: -DTARGET=CORE2 \ ${{ matrix.idx64-flags }} \ ${{ matrix.c-lapack-flags }} \ + ${{ matrix.no-avx512-flags }} \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ .. @@ -264,12 +293,33 @@ jobs: continue-on-error: true run: ccache -s + - name: Save ccache + # Save the cache after we are done (successfully) building + uses: actions/cache/save@v3 + with: + path: ${{ steps.ccache-prepare.outputs.ccachedir }} + key: ${{ steps.ccache-prepare.outputs.key }} + - name: Run tests + id: run-ctest timeout-minutes: 60 run: cd build && ctest + - name: Re-run tests + if: always() && (steps.run-ctest.outcome == 'failure') + timeout-minutes: 60 + run: | + cd build + echo "::group::Re-run ctest" + ctest --rerun-failed --output-on-failure || true + echo "::endgroup::" + echo "::group::Log from these tests" + [ ! -f Testing/Temporary/LastTest.log ] || cat Testing/Temporary/LastTest.log + echo "::endgroup::" + cross_build: + if: "github.repository == 'OpenMathLib/OpenBLAS'" runs-on: ubuntu-22.04 strategy: @@ -295,6 +345,7 @@ jobs: - name: Install Dependencies run: | + sudo apt-get update sudo apt-get install -y ccache gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-${{ matrix.target }}-cross - name: Compilation cache diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml new file mode 100644 index 000000000..f1bf8064c --- /dev/null +++ b/.github/workflows/loongarch64.yml @@ -0,0 +1,119 @@ +name: loongarch64 qemu test + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + TEST: + if: "github.repository == 'OpenMathLib/OpenBLAS'" + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - target: LOONGSONGENERIC + triple: loongarch64-unknown-linux-gnu + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSONGENERIC + - target: LOONGSON3R5 + triple: loongarch64-unknown-linux-gnu + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5 + - target: LOONGSON2K1000 + triple: loongarch64-unknown-linux-gnu + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000 + - target: DYNAMIC_ARCH + triple: loongarch64-unknown-linux-gnu + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Install APT deps + run: | + sudo add-apt-repository ppa:savoury1/virtualisation + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ + qemu-user-static + + - name: Download and install loongarch64-toolchain + run: | + wget https://github.com/sunhaiyong1978/CLFS-for-LoongArch/releases/download/8.1/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz + #wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz + tar -xf CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz -C /opt + + - name: Set env + run: | + echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV + echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.target }} + + - name: Configure ccache + run: | + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: Disable utest dsdot:dsdot_n_1 + run: | + echo -n > utest/test_dsdot.c + echo "Due to the qemu versions 7.2 causing utest cases to fail," + echo "the utest dsdot:dsdot_n_1 have been temporarily disabled." + + - name: Build OpenBLAS + run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) + + - name: Test + run: | + qemu-loongarch64-static ./utest/openblas_utest + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1 + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml index de7c0c0f3..4686ba713 100644 --- a/.github/workflows/mips64.yml +++ b/.github/workflows/mips64.yml @@ -2,11 +2,16 @@ name: mips64 qemu test on: [push, pull_request] +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + permissions: contents: read # to fetch code (actions/checkout) jobs: TEST: + if: "github.repository == 'OpenMathLib/OpenBLAS'" runs-on: ubuntu-latest strategy: fail-fast: false diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index 96063565d..ca57fba70 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -18,11 +18,16 @@ on: name: Nightly-Homebrew-Build +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + permissions: contents: read # to fetch code (actions/checkout) jobs: build-OpenBLAS-with-Homebrew: + if: "github.repository == 'OpenMathLib/OpenBLAS'" runs-on: macos-latest env: DEVELOPER_DIR: /Applications/Xcode_11.4.1.app/Contents/Developer diff --git a/.github/workflows/riscv64_vector.yml b/.github/workflows/riscv64_vector.yml new file mode 100644 index 000000000..dd6fe9ca8 --- /dev/null +++ b/.github/workflows/riscv64_vector.yml @@ -0,0 +1,253 @@ +name: riscv64 zvl256b qemu test + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read # to fetch code (actions/checkout) + +jobs: + TEST: + if: "github.repository == 'OpenMathLib/OpenBLAS'" + runs-on: ubuntu-latest + env: + triple: riscv64-unknown-linux-gnu + riscv_gnu_toolchain: https://github.com/riscv-collab/riscv-gnu-toolchain + riscv_gnu_toolchain_version: 13.2.0 + riscv_gnu_toolchain_nightly_download_path: /releases/download/2024.02.02/riscv64-glibc-ubuntu-22.04-llvm-nightly-2024.02.02-nightly.tar.gz + strategy: + fail-fast: false + matrix: + include: + - target: RISCV64_ZVL128B + opts: TARGET=RISCV64_ZVL128B BINARY=64 ARCH=riscv64 + qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=128,elen=64 + - target: RISCV64_ZVL256B + opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64 + qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64 + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: install build deps + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build make \ + libgomp1-riscv64-cross ccache + wget ${riscv_gnu_toolchain}/${riscv_gnu_toolchain_nightly_download_path} + tar -xvf $(basename ${riscv_gnu_toolchain_nightly_download_path}) -C /opt + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.target }} + + - name: Configure ccache + run: | + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: build OpenBLAS libs + run: | + export PATH="/opt/riscv/bin:$PATH" + make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \ + CC='ccache clang --rtlib=compiler-rt -target ${triple} --sysroot /opt/riscv/sysroot --gcc-toolchain=/opt/riscv/lib/gcc/riscv64-unknown-linux-gnu/${riscv_gnu_toolchain_version}/' \ + AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \ + RANLIB='ccache ${triple}-ranlib' \ + FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \ + HOSTCC=gcc HOSTFC=gfortran -j$(nproc) + + - name: build OpenBLAS tests + run: | + export PATH="/opt/riscv/bin:$PATH" + make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \ + CC='${triple}-gcc' \ + AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \ + RANLIB='ccache ${triple}-ranlib' \ + FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \ + HOSTCC=gcc HOSTFC=gfortran -j$(nproc) tests + + - name: build lapack-netlib tests + working-directory: ./lapack-netlib/TESTING + run: | + export PATH="/opt/riscv/bin:$PATH" + make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \ + CC='${triple}-gcc' \ + AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \ + RANLIB='ccache ${triple}-ranlib' \ + FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \ + HOSTCC=gcc HOSTFC=gfortran -j$(nproc) \ + LIN/xlintsts LIN/xlintstc LIN/xlintstd LIN/xlintstz LIN/xlintstrfs \ + LIN/xlintstrfc LIN/xlintstrfd LIN/xlintstrfz LIN/xlintstds \ + LIN/xlintstzc EIG/xeigtsts EIG/xeigtstc EIG/xeigtstd EIG/xeigtstz \ + + - name: OpenBLAS tests + shell: bash + run: | + export PATH="/opt/riscv/bin:$PATH" + export QEMU_CPU=${{ matrix.qemu_cpu }} + rm -rf ./test_out + mkdir -p ./test_out + run_test() { local DIR=$1; local CMD=$2; local DATA=$3; local OUTPUT="./test_out/$DIR.$CMD"; \ + echo "`pwd`/$DIR/$CMD $DIR/$DATA" >> $OUTPUT; \ + if [[ -z $DATA ]]; then qemu-riscv64 ./$DIR/$CMD |& tee $OUTPUT ; \ + else qemu-riscv64 ./$DIR/$CMD < ./$DIR/$DATA |& tee $OUTPUT ; fi ; \ + RV=$? ; if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi \ + } + run_test test cblat1 & + run_test test cblat2 cblat2.dat & + run_test test cblat3 cblat3.dat & + run_test test dblat1 & + run_test test dblat2 dblat2.dat & + run_test test dblat3 dblat3.dat & + run_test test sblat1 & + run_test test sblat2 sblat2.dat & + run_test test sblat3 sblat3.dat & + run_test test zblat1 & + run_test test zblat2 zblat2.dat & + run_test test zblat3 zblat3.dat & + run_test ctest xccblat1 & + run_test ctest xccblat2 cin2 & + run_test ctest xccblat3 cin3 & + run_test ctest xdcblat1 & + run_test ctest xdcblat2 din2 & + run_test ctest xdcblat3 din3 & + run_test ctest xscblat1 & + run_test ctest xscblat2 sin2 & + run_test ctest xscblat3 sin3 & + run_test ctest xzcblat1 & + run_test ctest xzcblat2 zin2 & + run_test ctest xzcblat3 zin3 & + wait + while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*) + if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi + + - name: netlib tests + shell: bash + run: | + : # these take a very long time + echo "Skipping netlib tests in CI" + exit 0 + : # comment out exit above to enable the tests + : # probably we want to identify a subset to run in CI + export PATH="/opt/riscv/bin:$PATH" + export QEMU_CPU=${{ matrix.qemu_cpu }} + rm -rf ./test_out + mkdir -p ./test_out + run_test() { local OUTPUT="./test_out/$1"; local DATA="./lapack-netlib/TESTING/$2"; local CMD="./lapack-netlib/TESTING/$3"; \ + echo "$4" >> $OUTPUT; \ + echo "$CMD" >> $OUTPUT; \ + qemu-riscv64 $CMD < $DATA |& tee $OUTPUT; \ + RV=$? ; if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi; \ + if grep -q fail $OUTPUT ; then echo "*** FAIL: log contains 'fail'" >> $OUTPUT ; fi ; \ + if grep -q rror $OUTPUT | grep -v -q "passed" | grep -v "largest error" ; then echo "*** FAIL: log contains 'error'" >> $OUTPUT ; fi \ + } + run_test stest.out stest.in LIN/xlintsts "Testing REAL LAPACK linear equation routines" & + run_test ctest.out ctest.in LIN/xlintstc "Testing COMPLEX LAPACK linear equation routines" & + run_test dtest.out dtest.in LIN/xlintstd "Testing DOUBLE PRECISION LAPACK linear equation routines" & + run_test ztest.out ztest.in LIN/xlintstz "Testing COMPLEX16 LAPACK linear equation routines" & + run_test dstest.out dstest.in LIN/xlintstds "Testing SINGLE-DOUBLE PRECISION LAPACK prototype linear equation routines" & + run_test zctest.out zctest.in LIN/xlintstzc "Testing COMPLEX-COMPLEX16 LAPACK prototype linear equation routines" & + run_test stest_rfp.out stest_rfp.in LIN/xlintstrfs "Testing REAL LAPACK RFP prototype linear equation routines" & + run_test dtest_rfp.out dtest_rfp.in LIN/xlintstrfd "Testing DOUBLE PRECISION LAPACK RFP prototype linear equation routines" & + run_test ctest_rfp.out ctest_rfp.in LIN/xlintstrfc "Testing COMPLEX LAPACK RFP prototype linear equation routines" & + run_test ztest_rfp.out ztest_rfp.in LIN/xlintstrfz "Testing COMPLEX16 LAPACK RFP prototype linear equation routines" & + run_test snep.out nep.in EIG/xeigtsts "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & + run_test ssep.out sep.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test sse2.out se2.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test ssvd.out svd.in EIG/xeigtsts "SVD - Testing Singular Value Decomposition routines" & + run_test sec.out sec.in EIG/xeigtsts "SEC - Testing REAL Eigen Condition Routines" & + run_test sed.out sed.in EIG/xeigtsts "SEV - Testing REAL Nonsymmetric Eigenvalue Driver" & + run_test sgg.out sgg.in EIG/xeigtsts "SGG - Testing REAL Nonsymmetric Generalized Eigenvalue Problem routines" & + run_test sgd.out sgd.in EIG/xeigtsts "SGD - Testing REAL Nonsymmetric Generalized Eigenvalue Problem driver routines" & + run_test ssb.out ssb.in EIG/xeigtsts "SSB - Testing REAL Symmetric Eigenvalue Problem routines" & + run_test ssg.out ssg.in EIG/xeigtsts "SSG - Testing REAL Symmetric Generalized Eigenvalue Problem routines" & + run_test sbal.out sbal.in EIG/xeigtsts "SGEBAL - Testing the balancing of a REAL general matrix" & + run_test sbak.out sbak.in EIG/xeigtsts "SGEBAK - Testing the back transformation of a REAL balanced matrix" & + run_test sgbal.out sgbal.in EIG/xeigtsts "SGGBAL - Testing the balancing of a pair of REAL general matrices" & + run_test sgbak.out sgbak.in EIG/xeigtsts "SGGBAK - Testing the back transformation of a pair of REAL balanced matrices" & + run_test sbb.out sbb.in EIG/xeigtsts "SBB - Testing banded Singular Value Decomposition routines" & + run_test sglm.out glm.in EIG/xeigtsts "GLM - Testing Generalized Linear Regression Model routines" & + run_test sgqr.out gqr.in EIG/xeigtsts "GQR - Testing Generalized QR and RQ factorization routines" & + run_test sgsv.out gsv.in EIG/xeigtsts "GSV - Testing Generalized Singular Value Decomposition routines" & + run_test scsd.out csd.in EIG/xeigtsts "CSD - Testing CS Decomposition routines" & + run_test slse.out lse.in EIG/xeigtsts "LSE - Testing Constrained Linear Least Squares routines" & + run_test cnep.out nep.in EIG/xeigtstc "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & + run_test csep.out sep.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test cse2.out se2.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test csvd.out svd.in EIG/xeigtstc "SVD - Testing Singular Value Decomposition routines" & + run_test cec.out cec.in EIG/xeigtstc "CEC - Testing COMPLEX Eigen Condition Routines" & + run_test ced.out ced.in EIG/xeigtstc "CES - Testing COMPLEX Nonsymmetric Schur Form Driver" & + run_test cgg.out cgg.in EIG/xeigtstc "CGG - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem routines" & + run_test cgd.out cgd.in EIG/xeigtstc "CGD - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem driver routines" & + run_test csb.out csb.in EIG/xeigtstc "CHB - Testing Hermitian Eigenvalue Problem routines" & + run_test csg.out csg.in EIG/xeigtstc "CSG - Testing Symmetric Generalized Eigenvalue Problem routines" & + run_test cbal.out cbal.in EIG/xeigtstc "CGEBAL - Testing the balancing of a COMPLEX general matrix" & + run_test cbak.out cbak.in EIG/xeigtstc "CGEBAK - Testing the back transformation of a COMPLEX balanced matrix" & + run_test cgbal.out cgbal.in EIG/xeigtstc "CGGBAL - Testing the balancing of a pair of COMPLEX general matrices" & + run_test cgbak.out cgbak.in EIG/xeigtstc "CGGBAK - Testing the back transformation of a pair of COMPLEX balanced matrices" & + run_test cbb.out cbb.in EIG/xeigtstc "CBB - Testing banded Singular Value Decomposition routines" & + run_test cglm.out glm.in EIG/xeigtstc "GLM - Testing Generalized Linear Regression Model routines" & + run_test cgqr.out gqr.in EIG/xeigtstc "GQR - Testing Generalized QR and RQ factorization routines" & + run_test cgsv.out gsv.in EIG/xeigtstc "GSV - Testing Generalized Singular Value Decomposition routines" & + run_test ccsd.out csd.in EIG/xeigtstc "CSD - Testing CS Decomposition routines" & + run_test clse.out lse.in EIG/xeigtstc "LSE - Testing Constrained Linear Least Squares routines" & + run_test dnep.out nep.in EIG/xeigtstd "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & + run_test dsep.out sep.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test dse2.out se2.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test dsvd.out svd.in EIG/xeigtstd "SVD - Testing Singular Value Decomposition routines" & + run_test dec.out dec.in EIG/xeigtstd "DEC - Testing DOUBLE PRECISION Eigen Condition Routines" & + run_test ded.out ded.in EIG/xeigtstd "DEV - Testing DOUBLE PRECISION Nonsymmetric Eigenvalue Driver" & + run_test dgg.out dgg.in EIG/xeigtstd "DGG - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem routines" & + run_test dgd.out dgd.in EIG/xeigtstd "DGD - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem driver routines" & + run_test dsb.out dsb.in EIG/xeigtstd "DSB - Testing DOUBLE PRECISION Symmetric Eigenvalue Problem routines" & + run_test dsg.out dsg.in EIG/xeigtstd "DSG - Testing DOUBLE PRECISION Symmetric Generalized Eigenvalue Problem routines" & + run_test dbal.out dbal.in EIG/xeigtstd "DGEBAL - Testing the balancing of a DOUBLE PRECISION general matrix" & + run_test dbak.out dbak.in EIG/xeigtstd "DGEBAK - Testing the back transformation of a DOUBLE PRECISION balanced matrix" & + run_test dgbal.out dgbal.in EIG/xeigtstd "DGGBAL - Testing the balancing of a pair of DOUBLE PRECISION general matrices" & + run_test dgbak.out dgbak.in EIG/xeigtstd "DGGBAK - Testing the back transformation of a pair of DOUBLE PRECISION balanced matrices" & + run_test dbb.out dbb.in EIG/xeigtstd "DBB - Testing banded Singular Value Decomposition routines" & + run_test dglm.out glm.in EIG/xeigtstd "GLM - Testing Generalized Linear Regression Model routines" & + run_test dgqr.out gqr.in EIG/xeigtstd "GQR - Testing Generalized QR and RQ factorization routines" & + run_test dgsv.out gsv.in EIG/xeigtstd "GSV - Testing Generalized Singular Value Decomposition routines" & + run_test dcsd.out csd.in EIG/xeigtstd "CSD - Testing CS Decomposition routines" & + run_test dlse.out lse.in EIG/xeigtstd "LSE - Testing Constrained Linear Least Squares routines" & + run_test znep.out nep.in EIG/xeigtstz "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & + run_test zsep.out sep.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test zse2.out se2.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test zsvd.out svd.in EIG/xeigtstz "SVD - Testing Singular Value Decomposition routines" & + run_test zec.out zec.in EIG/xeigtstz "ZEC - Testing COMPLEX16 Eigen Condition Routines" & + run_test zed.out zed.in EIG/xeigtstz "ZES - Testing COMPLEX16 Nonsymmetric Schur Form Driver" & + run_test zgg.out zgg.in EIG/xeigtstz "ZGG - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem routines" & + run_test zgd.out zgd.in EIG/xeigtstz "ZGD - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem driver routines" & + run_test zsb.out zsb.in EIG/xeigtstz "ZHB - Testing Hermitian Eigenvalue Problem routines" & + run_test zsg.out zsg.in EIG/xeigtstz "ZSG - Testing Symmetric Generalized Eigenvalue Problem routines" & + run_test zbal.out zbal.in EIG/xeigtstz "ZGEBAL - Testing the balancing of a COMPLEX16 general matrix" & + run_test zbak.out zbak.in EIG/xeigtstz "ZGEBAK - Testing the back transformation of a COMPLEX16 balanced matrix" & + run_test zgbal.out zgbal.in EIG/xeigtstz "ZGGBAL - Testing the balancing of a pair of COMPLEX general matrices" & + run_test zgbak.out zgbak.in EIG/xeigtstz "ZGGBAK - Testing the back transformation of a pair of COMPLEX16 balanced matrices" & + run_test zbb.out zbb.in EIG/xeigtstz "ZBB - Testing banded Singular Value Decomposition routines" & + run_test zglm.out glm.in EIG/xeigtstz "GLM - Testing Generalized Linear Regression Model routines" & + run_test zgqr.out gqr.in EIG/xeigtstz "GQR - Testing Generalized QR and RQ factorization routines" & + run_test zgsv.out gsv.in EIG/xeigtstz "GSV - Testing Generalized Singular Value Decomposition routines" & + run_test zcsd.out csd.in EIG/xeigtstz "CSD - Testing CS Decomposition routines" & + run_test zlse.out lse.in EIG/xeigtstz "LSE - Testing Constrained Linear Least Squares routines" & + wait + while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*) + python ./lapack-netlib/lapack_testing.py -d ./test_out -e > netlib_summary + TOTALS="$(grep 'ALL PRECISIONS' netlib_summary)" + NUMERICAL_ERRORS=-1 + OTHER_ERRORS=-1 + . <(awk '/ALL PRECISIONS/{printf "NUMERICAL_ERRORS=%s\nOTHER_ERRORS=%s\n", $5, $7}' netlib_summary + if (( NUMERICAL_ERRORS != 0 )) || (( OTHER_ERRORS != 0 )) ; then cat netlib_summary ; FAILURES=1 ; fi + if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi diff --git a/.gitignore b/.gitignore index 0fe20ecaa..9fa6a2c99 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ lapack-3.4.2 lapack-3.4.2.tgz lapack-netlib/make.inc lapack-netlib/lapacke/include/lapacke_mangling.h +lapack-netlib/SRC/la_constants.mod lapack-netlib/TESTING/testing_results.txt lapack-netlib/INSTALL/test* lapack-netlib/TESTING/xeigtstc @@ -46,6 +47,7 @@ config_last.h getarch getarch_2nd utest/openblas_utest +utest/openblas_utest_ext ctest/xccblat1 ctest/xccblat2 ctest/xccblat3 @@ -71,6 +73,7 @@ test/SBLAT3.SUMM test/ZBLAT2.SUMM test/ZBLAT3.SUMM test/SHBLAT3.SUMM +test/SBBLAT3.SUMM test/cblat1 test/cblat2 test/cblat3 @@ -81,6 +84,7 @@ test/sblat1 test/sblat2 test/sblat3 test/test_shgemm +test/test_sbgemm test/zblat1 test/zblat2 test/zblat3 diff --git a/CMakeLists.txt b/CMakeLists.txt index 502bf7a9d..9fbe878e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 23) +set(OpenBLAS_PATCH_VERSION 26.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") @@ -20,8 +20,12 @@ include(CMakePackageConfigHelpers) ####### option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF) +option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON) + option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON) +option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF) + option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF) option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) @@ -38,6 +42,11 @@ option(USE_PERL "Use the older PERL scripts for build preparation instead of uni option(NO_WARMUP "Do not run a benchmark on each startup just to find the best location for the memory buffer" ON) +option(FIXED_LIBNAME "Use a non-versioned name for the library and no symbolic linking to variant names" OFF) + +set(LIBNAMEPREFIX "" CACHE STRING "Add a prefix to the openblas part of the library name" ) +set(LIBNAMESUFFIX "" CACHE STRING "Add a suffix after the openblas part of the library name" ) + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) else() @@ -94,7 +103,7 @@ message(WARNING "CMake support is experimental. It does not yet support all buil include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") -set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE}) +set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) set(BLASDIRS interface driver/level2 driver/level3 driver/others) @@ -247,20 +256,21 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago endif() endif() -if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS) +# Seems that this hack doesn't required since macOS 11 Big Sur +if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) if (NOT NOFORTRAN) set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) set (CMAKE_Fortran_CREATE_SHARED_LIBRARY - "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " - "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " + "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " + "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") else () set (CMAKE_C_CREATE_SHARED_LIBRARY - "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " - "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " + "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " + "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") endif () endif() @@ -309,29 +319,36 @@ endif() #if (MSVC OR NOT NOFORTRAN) if (NOT NO_CBLAS) + if (NOT ONLY_CBLAS) # Broken without fortran on unix - add_subdirectory(utest) + add_subdirectory(utest) +endif() endif() if (NOT NOFORTRAN) + if (NOT ONLY_CBLAS) # Build test and ctest add_subdirectory(test) - if (BUILD_TESTING) + endif() + if (BUILD_TESTING AND NOT BUILD_WITHOUT_LAPACK) add_subdirectory(lapack-netlib/TESTING) endif() endif() if(NOT NO_CBLAS) + if (NOT ONLY_CBLAS) add_subdirectory(ctest) + endif() endif() if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) add_subdirectory(cpp_thread_test) endif() +if (NOT FIXED_LIBNAME) set_target_properties(${OpenBLAS_LIBS} PROPERTIES VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION} SOVERSION ${OpenBLAS_MAJOR_VERSION} ) - +endif() if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) if (NOT MSVC) target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition") @@ -398,21 +415,106 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") endif() + if (${BUILD_LAPACK_DEPRECATED}) + set (BLD 1) + else () + set (BLD 0) + endif() + if (${BUILD_BFLOAT16}) + set (BBF16 1) + else () + set (BBF16 0) + endif() + if (${BUILD_SINGLE}) + set (BS 1) + else () + set (BS 0) + endif() + if (${BUILD_DOUBLE}) + set (BD 1) + else () + set (BD 0) + endif() + if (${BUILD_COMPLEX}) + set (BC 1) + else () + set (BC 0) + endif() + if (${BUILD_COMPLEX16}) + set (BZ 1) + else () + set (BZ 0) + endif() if (NOT USE_PERL) add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD - COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def + COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so COMMENT "renaming symbols" ) else() add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD - COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def + COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so COMMENT "renaming symbols" ) endif() endif() +if (BUILD_BENCHMARKS) + #find_package(OpenMP REQUIRED) + file(GLOB SOURCES "benchmark/*.c") + if (NOT USE_OPENMP) + file(GLOB REMFILE "benchmark/smallscaling.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + endif() + if (BUILD_WITHOUT_LAPACK) + file(GLOB REMFILE "benchmark/cholesky.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/geev.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/gesv.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/getri.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/potrf.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/spmv.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/symv.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/linpack.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + endif() + if (NOT USE_GEMM3M) + file(GLOB REMFILE "benchmark/gemm3m.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + endif() + foreach(source ${SOURCES}) + get_filename_component(name ${source} NAME_WE) + if ((NOT ${name} STREQUAL "zdot-intel") AND (NOT ${name} STREQUAL "cula_wrapper")) + set(defines DEFAULT COMPLEX DOUBLE "COMPLEX\;DOUBLE") + foreach(define ${defines}) + set(target_name "benchmark_${name}") + if (NOT "${define}" STREQUAL "DEFAULT") + string(JOIN "_" define_str ${define}) + set(target_name "${target_name}_${define_str}") + endif() + if ((NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX_DOUBLE") AND + (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX_DOUBLE") AND + (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX_DOUBLE") AND + (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX_DOUBLE")) + add_executable(${target_name} ${source}) + target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} ) + # target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} OpenMP::OpenMP_C) + if (NOT "${define}" STREQUAL "DEFAULT") + target_compile_definitions(${target_name} PRIVATE ${define}) + endif() + endif() + endforeach() + endif() + endforeach() +endif() # Install project @@ -503,7 +605,7 @@ if(NOT NO_LAPACKE) ADD_CUSTOM_TARGET(genlapacke COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" ) - install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) + install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() # Install pkg-config files @@ -511,9 +613,8 @@ configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/ install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) -# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". set(PN OpenBLAS) -set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}") +set(CMAKECONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PN}${SUFFIX64}") configure_package_config_file(cmake/${PN}Config.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake" INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR}) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index f5e9dda91..203320826 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -23,6 +23,9 @@ * Optimization on AMD Piledriver * Optimization on Intel Haswell +* Chris Sidebottom + * Optimizations and other improvements targeting AArch64 + ## Previous Developers * Zaheer Chothia @@ -212,4 +215,11 @@ In chronological order: * [2022-03] Support RISC-V Vector Intrinisc 1.0 version. * Pablo Romero - * [2022-08] Fix building from sources for QNX \ No newline at end of file + * [2022-08] Fix building from sources for QNX + +* Mark Seminatore + * [2023-11-09] Improve Windows threading performance scaling + * [2024-02-09] Introduce MT_TRACE facility and improve code consistency + +* Dirreke + * [2024-01-16] Add basic support for the CSKY architecture diff --git a/Changelog.txt b/Changelog.txt index aa445ae82..b6139d6b7 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,195 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.26 + 2-Jan-2024 + +general: +- improved the version of openblas.pc that is created by the CMAKE build +- fixed a CMAKE-specific build problem on older versions of MacOS +- worked around linking problems on old versions of MacOS +- corrected installation location of the lapacke_mangling header in CMAKE builds +- added type declarations for complex variables to the MSVC-specific parts of the LAPACK header +- significantly sped up ?GESV for small problem sizes by introducing a lower bound for multithreading +- imported additions and corrections from the Reference-LAPACK project: + - added new LAPACK functions for truncated QR with pivoting (Reference-LAPACK PRs 891&941) + - handle miscalculation of minimum work array size in corner cases (Reference-LAPACK PR 942) + - fixed use of uninitialized variables in ?GEDMD and improved inline documentation (PR 959) + - fixed use of uninitialized variables (and consequential failures) in ?BBCSD (PR 967) + - added tests for the recently introduced Dynamic Mode Decomposition functions (PR 736) + - fixed several memory leaks in the LAPACK testsuite (PR 953) + - fixed counting of testsuite results by the Python script (PR 954) + +x86-64: +- fixed computation of CASUM on SkylakeX and newer targets in the special + case that AVX512 is not supported by the compiler or operating environment +- fixed potential undefined behaviour in the CASUM/ZASUM kernels for AVX512 targets +- worked around a problem in the pre-AVX kernels for GEMV +- sped up the thread management code on MS Windows + +arm64: +- fixed building of the LAPACK testsuite with Xcode 15 on Apple M1 and newer +- sped up the thread management code on MS Windows +- sped up SGEMM and DGEMM on Neoverse V1 and N1 +- sped up ?DOT on SVE-capable targets +- reduced the number of targets in DYNAMIC_ARCH builds by eliminating functionally equivalent ones +- included support for Apple M1 and newer targets in DYNAMIC_ARCH builds + +power: +- improved the SGEMM kernel for POWER10 +- fixed compilation with (very) old versions of gcc +- fixed detection of old 32bit PPC targets in CMAKE-based builds +- added autodetection of the POWERPC 7400 subtype +- fixed CMAKE-based compilation for PPCG4 and PPC970 targets + +loongarch64: +- added and improved optimized kernels for almost all BLAS functions + +==================================================================== +Version 0.3.25 + 12-Nov-2023 + +general: +- improved the error message shown on exceeding the maximum thread count +- improved the code to add supplementary thread buffers in case of overflow +- fixed a potential division by zero in ?ROTG +- improved the ?MATCOPY functions to accept zero-sized rows or columns +- corrected empty prototypes in function declarations +- cleaned up unused declarations in the f2c-converted versions of the LAPACK sources +- fixed compilation with the Cray CCE Compiler suite +- improved link line rewriting to avoid mixed libgomp/libomp builds with clang&gfortran +- worked around OPENMP builds with LLVM14's libomp hanging on FreeBSD +- improved the Makefiles to require less option duplication on "make install" +- imported the following changes from the upcoming release 3.12 of Reference-LAPACK + - deprecate utility functions ?GELQS and ?GEQRS (LAPACK PR 900) + - apply rounding up to workspace calculations done in floating point (LAPACK PR 904) + - avoid overflow in STGEX2/DTGEX2 (LAPACK PR 907) + - fix accumulation in ?LASSQ (LAPACK PR 909) + - fix handling of NaN values in ?GECON (LAPACK PR 926) + - avoid overflow in CBDSQR/ZBDSQR (LAPACK PR 927) + - fix poor vector orthogonalizations in ?ORBDB5/?UNBDB5 (LAPACK PR 928 & 930) + +x86-64: +- fixed compile-time autodetection of AMD Ryzen3 and Ryzen4 cpus +- fixed capability-based fallback selection for unknown cpus in DYNAMIC_ARCH +- added AVX512 optimizations for ?ASUM on Sapphire Rapids and Cooper Lake + +ARM64: +- fixed building on Apple with homebrew gcc +- fixed building with XCODE 15 +- fixed building on A64FX and Cortex A710/X1/X2 +- increased the default buffer size for recent ARM server cpus + +POWER: +- fixed building with the IBM xlf 16.1.1 compiler +- fixed building with IBM XL C +- added support for DYNAMIC_ARCH builds with clang +- fixed union declaration in the BFLOAT16 test case +- enable optimizations for the AIX assembler on POWER10 + +LOONGARCH64: +- added an optimized SGEMV kernel +- added an optimized DTRSM kernel + +==================================================================== +Version 0.3.24 + 03-Sep-2023 + +general: + - declared the arguments of cblas_xerbla as const (in accordance with the reference implementation + and others, the previous discrepancy appears to have dated back to GotoBLAS) + - fixed the implementation of ?GEMMT that was added in 0.3.23 + - made cpu-specific SWITCH_RATIO parameters for GEMM available to DYNAMIC_ARCH builds + - fixed application of SYMBOLSUFFIX in CMAKE builds + - fixed missing SSYCONVF function in the shared library + - fixed parallel build logic used with gmake + - added support for compilation with LLVM17, in particular its new Fortran compiler + - added support for CMAKE builds using the NVIDIA HPC compiler + - fixed INTERFACE64 builds with CMAKE and the f95 Fortran compiler + - fixed cross-build detection and management in c_check + - disabled building of the tests with CMAKE when ONLY_CBLAS is defined + - fixed several issues with the handling of runtime limits on the number of OPENMP threads + - corrected the error code returned by SGEADD/DGEADD when LDA is too small + - corrected the error code returned by IMATCOPY when LDB is too small + - updated ?NRM2 to support negative increment values (as introduced in release 3.10 + of the reference BLAS) + - fixed OpenMP builds with CLANG for the case where libomp is not in a standard location + - fixed a potential overwrite of unrelated memory during thread initialisation on startup + - fixed a potential integer overflow in the multithreading threshold for ?SYMM/?SYRK + - fixed build of the LAPACKE interfaces for the LAPACK 3.11.0 ?TRSYL functions added in 0.3.22 + - fixed installation of .cmake files in concurrent 32 and 64bit builds with CMAKE + - applied additions and corrections from the development branch of Reference-LAPACK: + - fixed actual arguments passed to a number of LAPACK functions (from Reference-LAPACK PR 885) + - fixed workspace query results in LAPACK ?SYTRF/?TRECV3 (from Reference-LAPACK PR 883) + - fixed derivation of the UPLO parameter in LAPACKE_?larfb (from Reference-LAPACK PR 878) + - fixed a crash in LAPACK ?GELSDD on NRHS=0 (from Reference-LAPACK PR 876) + - added new LAPACK utility functions CRSCL and ZRSCL (from Reference-LAPACK PR 839) + - corrected the order of eigenvalues for 2x2 matrices in ?STEMR (Reference-LAPACK PR 867) + - removed spurious reference to OpenMP variables outside OpenMP contexts (Reference-LAPACK PR 860) + - updated file comments on use of LAMBDA variable in LAPACK (Reference-LAPACK PR 852) + - fixed documentation of LAPACK SLASD0/DLASD0 (Reference-LAPACK PR 855) + - fixed confusing use of "minor" in LAPACK documentation (Reference-LAPACK PR 849) + - added new LAPACK functions ?GEDMD for dynamic mode decomposition (Reference-LAPACK PR 736) + - fixed potential stack overflows in the EIG part of the LAPACK testsuite (Reference-LAPACK PR 854) + - applied small improvements to the variants of Cholesky and QR functions (Reference-LAPACK PR 847) + - removed unused variables from LAPACK ?BDSQR (Reference-LAPACK PR 832) + - fixed a potential crash on allocation failure in LAPACKE SGEESX/DGEESX (Reference-LAPACK PR 836) + - added a quick return from SLARUV/DLARUV for N < 1 (Reference-LAPACK PR 837) + - updated function descriptions in LAPACK ?GEGS/?GEGV (Reference-LAPACK PR 831) + - improved algorithm description in ?GELSY (Reference-LAPACK PR 833) + - fixed scaling in LAPACK STGSNA/DTGSNA (Reference-LAPACK PR 830) + - fixed crash in LAPACKE_?geqrt with row-major data (Reference-LAPACK PR 768) + - added LAPACKE interfaces for C/ZUNHR_COL and S/DORHR_COL (Reference-LAPACK PR 827) + - added error exit tests for SYSV/SYTD2/GEHD2 to the testsuite (Reference-LAPACK PR 795) + - fixed typos in LAPACK source and comments (Reference-LAPACK PRs 809,811,812,814,820) + - adopt refactored ?GEBAL implementation (Reference-LAPACK PR 808) + +x86_64: + - added cpu model autodetection for Intel Alder Lake N + - added activation of the AMX tile to the Sapphire Rapids SBGEMM kernel + - worked around miscompilations of GEMV/SYMV kernels by gcc's tree-vectorizer + - fixed compilation of Cooperlake and Sapphire Rapids kernels with CLANG + - fixed runtime detection of Cooperlake and Sapphire Rapids in DYNAMIC_ARCH + - fixed feature-based cputype fallback in DYNAMIC_ARCH + - added support for building the AVX512 kernels with the NVIDIA HPC compiler + - corrected ZAXPY result on old pre-AVX hardware for the INCX=0 case + - fixed a potential use of uninitialized variables in ZTRSM + +ARM64: + - added cpu model autodetection for Apple M2 + - fixed wrong results of CGEMM/CTRMM/DNRM2 under OSX (use of reserved register) + - added support for building the SVE kernels with the NVIDIA HPC compiler + - added support for building the SVE kernels with the Apple Clang compiler + - fixed compiler option handling for building the SVE kernels with LLVM + - implemented SWITCH_RATIO parameter for improved GEMM performance on Neoverse + - activated SVE SGEMM and DGEMM kernels for Neoverse V1 + - improved performance of the SVE CGEMM and ZGEMM kernels on Neoverse V1 + - improved kernel selection for the ARMV8SVE target and added it to DYNAMIC_ARCH + - fixed runtime check for SVE availability in DYNAMIC_ARCH builds to take OS or + container restrictions into account + - fixed a potential use of uninitialized variables in ZTRSM + - fix a potential misdetection of ARMV8 hardware as 32bit in CMAKE builds + +LOONGARCH64: + - added ABI detection + - added support for cpu affinity handling + - fixed compilation with early versions of the Loongson toolchain + - added an optimized SGEMM kernel for 3A5000 + - added optimized DGEMV kernels for 3A5000 + - improved the performance of the DGEMM kernel for 3A5000 + +MIPS64: + - fixed miscompilation of TRMM kernels for the MIPS64_GENERIC target + +POWER: + - fixed compiler warnings in the POWER10 SBGEMM kernel + +RISCV: + - fixed application of the INTERFACE64 option when building with CMAKE + - fix a potential misdetection of RISCV hardware as 32bit in CMAKE builds + - fixed IDAMAX and DOT kernels for C910V + - fixed corner cases in the ROT and SWAP kernels for C910V + - fixed compilation of the C910V target with recent vendor compilers + ==================================================================== Version 0.3.23 01-Apr-2023 diff --git a/GotoBLAS_06WeirdPerformance.txt b/GotoBLAS_06WeirdPerformance.txt index 05766e17b..0f7cec5c9 100644 --- a/GotoBLAS_06WeirdPerformance.txt +++ b/GotoBLAS_06WeirdPerformance.txt @@ -11,7 +11,7 @@ operation is finished. -2. Simlar problem may happen under virtual machine. If supervisor +2. Similar problem may happen under virtual machine. If supervisor allocates different cores for each scheduling, BLAS performnace will be bad. This is because BLAS also utilizes all cache, unexpected re-schedule for different core may result of heavy diff --git a/Jenkinsfile b/Jenkinsfile index 2b61bed9f..baeeee59f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,9 +1,14 @@ -node { - stage('Checkout') { - checkout +pipeline { + agent { + docker { + image 'osuosl/ubuntu-s390x' } - + } + stages { stage('Build') { - sh("make") + steps { + sh 'make clean && make' + } } + } } diff --git a/Jenkinsfile.pwr b/Jenkinsfile.pwr new file mode 100644 index 000000000..96e18b8ad --- /dev/null +++ b/Jenkinsfile.pwr @@ -0,0 +1,16 @@ +pipeline { + agent { + docker { + image 'osuosl/ubuntu-ppc64le' + } + } + stages { + stage('Build') { + steps { + sh 'sudo apt update' + sh 'sudo apt install gfortran -y' + sh 'make clean && make' + } + } + } +} diff --git a/Makefile b/Makefile index 144b3400d..c04074795 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,9 @@ TOPDIR = . include ./Makefile.system +LNCMD = ln -fs +ifeq ($(FIXED_LIBNAME), 1) +LNCMD = true +endif BLASDIRS = interface driver/level2 driver/level3 driver/others @@ -35,14 +39,18 @@ export NO_LAPACK export C_LAPACK endif +ifeq ($(F_COMPILER),CRAY) +LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -Og -Os,$(LAPACK_FFLAGS)) +else LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) +endif SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test .PHONY : all libs netlib $(RELA) test ctest shared install -.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test +.NOTPARALLEL : shared -all :: libs netlib $(RELA) tests shared +all :: tests @echo @echo " OpenBLAS build complete. ($(LIB_COMPONENTS))" @echo @@ -130,17 +138,17 @@ shared : libs netlib $(RELA) ifneq ($(NO_SHARED), 1) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) @$(MAKE) -C exports so - @ln -fs $(LIBSONAME) $(LIBPREFIX).so - @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) + @$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so + @$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD)) @$(MAKE) -C exports so - @ln -fs $(LIBSONAME) $(LIBPREFIX).so + @$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), Darwin) @$(MAKE) -C exports dyn - @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib - @ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib + @$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).dylib + @$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib endif ifeq ($(OSNAME), WINNT) @$(MAKE) -C exports dll @@ -148,9 +156,12 @@ endif ifeq ($(OSNAME), CYGWIN_NT) @$(MAKE) -C exports dll endif +ifeq ($(OSNAME), AIX) + @$(MAKE) -C exports so +endif endif -tests : libs netlib $(RELA) shared +tests : shared ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) touch $(LIBNAME) ifndef NO_FBLAS @@ -206,16 +217,32 @@ ifeq ($(DYNAMIC_OLDER), 1) @echo DYNAMIC_OLDER=1 >> Makefile.conf_last endif endif + @echo TARGET=$(CORE) >> Makefile.conf_last ifdef USE_THREAD @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last endif - @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) +ifdef SMP +ifdef NUM_THREADS + @echo NUM_THREADS=$(NUM_THREADS) >> Makefile.conf_last +else + @echo NUM_THREADS=$(NUM_CORES) >> Makefile.conf_last +endif +endif +ifeq ($(USE_OPENMP),1) + @echo USE_OPENMP=1 >> Makefile.conf_last +endif +ifeq ($(INTERFACE64),1) + @echo INTERFACE64=1 >> Makefile.conf_last +endif + @echo THELIBNAME=$(LIBNAME) >> Makefile.conf_last + @echo THELIBSONAME=$(LIBSONAME) >> Makefile.conf_last + @-$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) @touch lib.grd prof : prof_blas prof_lapack prof_blas : - ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) + $(LNCMD) $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) for d in $(SUBDIRS) ; \ do if test -d $$d; then \ $(MAKE) -C $$d prof || exit 1 ; \ @@ -226,7 +253,7 @@ ifeq ($(DYNAMIC_ARCH), 1) endif blas : - ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) + $(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) for d in $(BLASDIRS) ; \ do if test -d $$d; then \ $(MAKE) -C $$d libs || exit 1 ; \ @@ -234,7 +261,7 @@ blas : done hpl : - ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) + $(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) for d in $(BLASDIRS) ../laswp exports ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ @@ -248,7 +275,7 @@ ifeq ($(DYNAMIC_ARCH), 1) endif hpl_p : - ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) + $(LNCMD) $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) for d in $(SUBDIRS) ../laswp exports ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ @@ -373,14 +400,15 @@ ifneq ($(CROSS), 1) (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING) endif -lapack-runtest: +lapack-runtest: lapack-test ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ ./testsecond; ./testdsecnd; ./testieee; ./testversion ) - (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) + (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING ) blas-test: (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out) + $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out) diff --git a/Makefile.L3 b/Makefile.L3 deleted file mode 100644 index 76586d826..000000000 --- a/Makefile.L3 +++ /dev/null @@ -1,5089 +0,0 @@ -USE_GEMM3M = 0 -OS := $(shell uname) - -ifeq ($(ARCH), x86) -USE_GEMM3M = 1 -endif - -ifeq ($(ARCH), x86_64) -USE_GEMM3M = 1 -endif - -ifeq ($(ARCH), x86_64) -USE_DIRECT_SGEMM = 1 -endif - -ifeq ($(ARCH), ia64) -USE_GEMM3M = 1 -endif - -ifeq ($(ARCH), arm) -USE_TRMM = 1 -endif - -ifeq ($(ARCH), arm64) -USE_TRMM = 1 -endif - -ifeq ($(ARCH), riscv64) -USE_TRMM = 1 -endif - -ifneq ($(DYNAMIC_ARCH), 1) -ifeq ($(TARGET), GENERIC) -USE_TRMM = 1 -endif -endif - -ifeq ($(CORE), HASWELL) -USE_TRMM = 1 -endif - -ifeq ($(CORE), SKYLAKEX) -USE_TRMM = 1 -endif - -ifeq ($(CORE), COOPERLAKE) -USE_TRMM = 1 -endif - -ifeq ($(CORE), SAPPHIRERAPIDS) -USE_TRMM = 1 -endif - -ifeq ($(CORE), ZEN) -USE_TRMM = 1 -endif - -ifeq ($(CORE), POWER8) -ifeq ($(BINARY64),1) -USE_TRMM = 1 -endif -endif - -ifeq ($(CORE), POWER9) -USE_TRMM = 1 -endif - -ifeq ($(CORE), POWER10) -USE_TRMM = 1 -endif - -ifeq ($(ARCH), zarch) -USE_TRMM = 1 -endif - -ifeq ($(CORE), Z14) -USE_TRMM = 1 -endif - -ifdef USE_DIRECT_SGEMM -ifndef SGEMMDIRECTKERNEL -SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c -SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c -endif -endif - -ifeq ($(BUILD_BFLOAT16), 1) -ifndef SBGEMMKERNEL -SBGEMM_BETA = ../generic/gemm_beta.c -SBGEMMKERNEL = ../generic/gemmkernel_2x2.c -SBGEMMINCOPY = ../generic/gemm_ncopy_2.c -SBGEMMITCOPY = ../generic/gemm_tcopy_2.c -SBGEMMONCOPY = ../generic/gemm_ncopy_2.c -SBGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) -SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) -SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) -SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) -endif - -SBKERNELOBJS += \ - sbgemm_kernel$(TSUFFIX).$(SUFFIX) \ - $(SBGEMMINCOPYOBJ) $(SBGEMMITCOPYOBJ) \ - $(SBGEMMONCOPYOBJ) $(SBGEMMOTCOPYOBJ) -endif - -ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" -SKERNELOBJS += \ - sgemm_kernel$(TSUFFIX).$(SUFFIX) \ - sgemm_beta$(TSUFFIX).$(SUFFIX) \ - $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ - $(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) - -ifdef USE_DIRECT_SGEMM -SKERNELOBJS += \ - sgemm_direct$(TSUFFIX).$(SUFFIX) \ - sgemm_direct_performant$(TSUFFIX).$(SUFFIX) -endif -endif - -ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" -DKERNELOBJS += \ - dgemm_beta$(TSUFFIX).$(SUFFIX) \ - dgemm_kernel$(TSUFFIX).$(SUFFIX) \ - $(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \ - $(DGEMMONCOPYOBJ) $(DGEMMOTCOPYOBJ) -endif - -QKERNELOBJS += \ - qgemm_kernel$(TSUFFIX).$(SUFFIX) \ - $(QGEMMINCOPYOBJ) $(QGEMMITCOPYOBJ) \ - $(QGEMMONCOPYOBJ) $(QGEMMOTCOPYOBJ) - -ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" -CKERNELOBJS += \ - cgemm_kernel_n$(TSUFFIX).$(SUFFIX) cgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ - cgemm_kernel_l$(TSUFFIX).$(SUFFIX) cgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ - $(CGEMMINCOPYOBJ) $(CGEMMITCOPYOBJ) \ - $(CGEMMONCOPYOBJ) $(CGEMMOTCOPYOBJ) -endif - -ifeq ($(BUILD_COMPLEX16),1) -ZKERNELOBJS += \ - zgemm_kernel_n$(TSUFFIX).$(SUFFIX) zgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ - zgemm_kernel_l$(TSUFFIX).$(SUFFIX) zgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ - $(ZGEMMINCOPYOBJ) $(ZGEMMITCOPYOBJ) \ - $(ZGEMMONCOPYOBJ) $(ZGEMMOTCOPYOBJ) -endif - -XKERNELOBJS += \ - xgemm_kernel_n$(TSUFFIX).$(SUFFIX) xgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ - xgemm_kernel_l$(TSUFFIX).$(SUFFIX) xgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ - $(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \ - $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) - -ifeq ($(BUILD_BFLOAT16),1) -SBBLASOBJS += $(SBKERNELOBJS) -endif -SBLASOBJS += $(SKERNELOBJS) -DBLASOBJS += $(DKERNELOBJS) -QBLASOBJS += $(QKERNELOBJS) -CBLASOBJS += $(CKERNELOBJS) -ZBLASOBJS += $(ZKERNELOBJS) -XBLASOBJS += $(XKERNELOBJS) - -ifeq ($(BUILD_BFLOAT16),1) -SBBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX) -endif - -ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" -SBLASOBJS += \ - sgemm_beta$(TSUFFIX).$(SUFFIX) \ - strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - strmm_kernel_RN$(TSUFFIX).$(SUFFIX) strmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - strsm_kernel_LN$(TSUFFIX).$(SUFFIX) strsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - strsm_kernel_RN$(TSUFFIX).$(SUFFIX) strsm_kernel_RT$(TSUFFIX).$(SUFFIX) -endif - -ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" -DBLASOBJS += \ - dgemm_beta$(TSUFFIX).$(SUFFIX) \ - dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) -endif - -QBLASOBJS += \ - qgemm_beta$(TSUFFIX).$(SUFFIX) \ - qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) - -ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" -CBLASOBJS += \ - ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ - ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) -endif -ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" -CBLASOBJS += \ - cgemm_beta$(TSUFFIX).$(SUFFIX) \ - ctrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - ctrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ - ctrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(BUILD_COMPLEX16),1) -ZBLASOBJS += \ - zgemm_beta$(TSUFFIX).$(SUFFIX) \ - ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ - ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \ - ztrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - ztrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ - ztrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX) -endif - -XBLASOBJS += \ - xgemm_beta$(TSUFFIX).$(SUFFIX) \ - xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ - xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \ - xtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - xtrsm_kernel_LR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ - xtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) - -ifeq ($(USE_GEMM3M), 1) - -CBLASOBJS += cgemm3m_kernel$(TSUFFIX).$(SUFFIX) -ZBLASOBJS += zgemm3m_kernel$(TSUFFIX).$(SUFFIX) -XBLASOBJS += xgemm3m_kernel$(TSUFFIX).$(SUFFIX) - -endif - -ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" -SBLASOBJS += \ - strmm_iunucopy$(TSUFFIX).$(SUFFIX) strmm_iunncopy$(TSUFFIX).$(SUFFIX) \ - strmm_ilnucopy$(TSUFFIX).$(SUFFIX) strmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - strmm_iutucopy$(TSUFFIX).$(SUFFIX) strmm_iutncopy$(TSUFFIX).$(SUFFIX) \ - strmm_iltucopy$(TSUFFIX).$(SUFFIX) strmm_iltncopy$(TSUFFIX).$(SUFFIX) \ - strmm_ounucopy$(TSUFFIX).$(SUFFIX) strmm_ounncopy$(TSUFFIX).$(SUFFIX) \ - strmm_olnucopy$(TSUFFIX).$(SUFFIX) strmm_olnncopy$(TSUFFIX).$(SUFFIX) \ - strmm_outucopy$(TSUFFIX).$(SUFFIX) strmm_outncopy$(TSUFFIX).$(SUFFIX) \ - strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX) \ - strsm_iunucopy$(TSUFFIX).$(SUFFIX) strsm_iunncopy$(TSUFFIX).$(SUFFIX) \ - strsm_ilnucopy$(TSUFFIX).$(SUFFIX) strsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - strsm_iutucopy$(TSUFFIX).$(SUFFIX) strsm_iutncopy$(TSUFFIX).$(SUFFIX) \ - strsm_iltucopy$(TSUFFIX).$(SUFFIX) strsm_iltncopy$(TSUFFIX).$(SUFFIX) \ - strsm_ounucopy$(TSUFFIX).$(SUFFIX) strsm_ounncopy$(TSUFFIX).$(SUFFIX) \ - strsm_olnucopy$(TSUFFIX).$(SUFFIX) strsm_olnncopy$(TSUFFIX).$(SUFFIX) \ - strsm_outucopy$(TSUFFIX).$(SUFFIX) strsm_outncopy$(TSUFFIX).$(SUFFIX) \ - strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX) \ - ssymm_iutcopy$(TSUFFIX).$(SUFFIX) ssymm_iltcopy$(TSUFFIX).$(SUFFIX) \ - ssymm_outcopy$(TSUFFIX).$(SUFFIX) ssymm_oltcopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(BUILD_DOUBLE),1) -DBLASOBJS += \ - dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ - dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ - dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ - dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) dtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ - dtrmm_olnucopy$(TSUFFIX).$(SUFFIX) dtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ - dtrmm_outucopy$(TSUFFIX).$(SUFFIX) dtrmm_outncopy$(TSUFFIX).$(SUFFIX) \ - dtrmm_oltucopy$(TSUFFIX).$(SUFFIX) dtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ - dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ - dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ - dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ - dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) dtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ - dtrsm_olnucopy$(TSUFFIX).$(SUFFIX) dtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ - dtrsm_outucopy$(TSUFFIX).$(SUFFIX) dtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ - dtrsm_oltucopy$(TSUFFIX).$(SUFFIX) dtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ - dsymm_iutcopy$(TSUFFIX).$(SUFFIX) dsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ - dsymm_outcopy$(TSUFFIX).$(SUFFIX) dsymm_oltcopy$(TSUFFIX).$(SUFFIX) -endif - -QBLASOBJS += \ - qtrmm_iunucopy$(TSUFFIX).$(SUFFIX) qtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ - qtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) qtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - qtrmm_iutucopy$(TSUFFIX).$(SUFFIX) qtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ - qtrmm_iltucopy$(TSUFFIX).$(SUFFIX) qtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ - qtrmm_ounucopy$(TSUFFIX).$(SUFFIX) qtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ - qtrmm_olnucopy$(TSUFFIX).$(SUFFIX) qtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ - qtrmm_outucopy$(TSUFFIX).$(SUFFIX) qtrmm_outncopy$(TSUFFIX).$(SUFFIX) \ - qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ - qtrsm_iunucopy$(TSUFFIX).$(SUFFIX) qtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ - qtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) qtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - qtrsm_iutucopy$(TSUFFIX).$(SUFFIX) qtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ - qtrsm_iltucopy$(TSUFFIX).$(SUFFIX) qtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ - qtrsm_ounucopy$(TSUFFIX).$(SUFFIX) qtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ - qtrsm_olnucopy$(TSUFFIX).$(SUFFIX) qtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ - qtrsm_outucopy$(TSUFFIX).$(SUFFIX) qtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ - qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ - qsymm_iutcopy$(TSUFFIX).$(SUFFIX) qsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ - qsymm_outcopy$(TSUFFIX).$(SUFFIX) qsymm_oltcopy$(TSUFFIX).$(SUFFIX) - -ifeq ($(BUILD_COMPLEX),1) -CBLASOBJS += \ - ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ - ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ - ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ - ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) ctrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ - ctrmm_olnucopy$(TSUFFIX).$(SUFFIX) ctrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ - ctrmm_outucopy$(TSUFFIX).$(SUFFIX) ctrmm_outncopy$(TSUFFIX).$(SUFFIX) \ - ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ - csymm_iutcopy$(TSUFFIX).$(SUFFIX) csymm_iltcopy$(TSUFFIX).$(SUFFIX) \ - csymm_outcopy$(TSUFFIX).$(SUFFIX) csymm_oltcopy$(TSUFFIX).$(SUFFIX) \ - chemm_iutcopy$(TSUFFIX).$(SUFFIX) chemm_iltcopy$(TSUFFIX).$(SUFFIX) \ - chemm_outcopy$(TSUFFIX).$(SUFFIX) chemm_oltcopy$(TSUFFIX).$(SUFFIX) -endif -ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" -CBLASOBJS += \ - ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ - ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ - ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ - ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) ctrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ - ctrsm_olnucopy$(TSUFFIX).$(SUFFIX) ctrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ - ctrsm_outucopy$(TSUFFIX).$(SUFFIX) ctrsm_outncopy$(TSUFFIX).$(SUFFIX) \ - ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(BUILD_COMPLEX16),1) -ZBLASOBJS += \ - ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ - ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ - ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ - ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) ztrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ - ztrmm_olnucopy$(TSUFFIX).$(SUFFIX) ztrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ - ztrmm_outucopy$(TSUFFIX).$(SUFFIX) ztrmm_outncopy$(TSUFFIX).$(SUFFIX) \ - ztrmm_oltucopy$(TSUFFIX).$(SUFFIX) ztrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ - ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ - ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ - ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ - ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) ztrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ - ztrsm_olnucopy$(TSUFFIX).$(SUFFIX) ztrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ - ztrsm_outucopy$(TSUFFIX).$(SUFFIX) ztrsm_outncopy$(TSUFFIX).$(SUFFIX) \ - ztrsm_oltucopy$(TSUFFIX).$(SUFFIX) ztrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ - zsymm_iutcopy$(TSUFFIX).$(SUFFIX) zsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ - zsymm_outcopy$(TSUFFIX).$(SUFFIX) zsymm_oltcopy$(TSUFFIX).$(SUFFIX) \ - zhemm_iutcopy$(TSUFFIX).$(SUFFIX) zhemm_iltcopy$(TSUFFIX).$(SUFFIX) \ - zhemm_outcopy$(TSUFFIX).$(SUFFIX) zhemm_oltcopy$(TSUFFIX).$(SUFFIX) -endif - -XBLASOBJS += \ - xtrmm_iunucopy$(TSUFFIX).$(SUFFIX) xtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ - xtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) xtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - xtrmm_iutucopy$(TSUFFIX).$(SUFFIX) xtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ - xtrmm_iltucopy$(TSUFFIX).$(SUFFIX) xtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ - xtrmm_ounucopy$(TSUFFIX).$(SUFFIX) xtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ - xtrmm_olnucopy$(TSUFFIX).$(SUFFIX) xtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ - xtrmm_outucopy$(TSUFFIX).$(SUFFIX) xtrmm_outncopy$(TSUFFIX).$(SUFFIX) \ - xtrmm_oltucopy$(TSUFFIX).$(SUFFIX) xtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ - xtrsm_iunucopy$(TSUFFIX).$(SUFFIX) xtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ - xtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) xtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - xtrsm_iutucopy$(TSUFFIX).$(SUFFIX) xtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ - xtrsm_iltucopy$(TSUFFIX).$(SUFFIX) xtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ - xtrsm_ounucopy$(TSUFFIX).$(SUFFIX) xtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ - xtrsm_olnucopy$(TSUFFIX).$(SUFFIX) xtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ - xtrsm_outucopy$(TSUFFIX).$(SUFFIX) xtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ - xtrsm_oltucopy$(TSUFFIX).$(SUFFIX) xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ - xsymm_iutcopy$(TSUFFIX).$(SUFFIX) xsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ - xsymm_outcopy$(TSUFFIX).$(SUFFIX) xsymm_oltcopy$(TSUFFIX).$(SUFFIX) \ - xhemm_iutcopy$(TSUFFIX).$(SUFFIX) xhemm_iltcopy$(TSUFFIX).$(SUFFIX) \ - xhemm_outcopy$(TSUFFIX).$(SUFFIX) xhemm_oltcopy$(TSUFFIX).$(SUFFIX) - -ifeq ($(USE_GEMM3M), 1) - -ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" -CBLASOBJS += \ - cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ - cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ - cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \ - cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \ - cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \ - cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \ - csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ - csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ - csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ - csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ - csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ - csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \ - chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ - chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ - chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ - chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ - chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ - chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(BUILD_COMPLEX16),1) -ZBLASOBJS += \ - zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ - zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ - zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \ - zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \ - zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \ - zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \ - zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ - zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ - zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ - zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ - zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ - zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \ - zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ - zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ - zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ - zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ - zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ - zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) -endif - -XBLASOBJS += \ - xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ - xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ - xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \ - xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \ - xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \ - xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \ - xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ - xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ - xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ - xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ - xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ - xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \ - xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ - xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ - xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ - xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ - xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ - xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) - -endif - -###### BLAS small matrix optimization ##### -ifeq ($(SMALL_MATRIX_OPT), 1) - -ifeq ($(BUILD_BFLOAT16),1) -SBBLASOBJS += \ - sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ - sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ - sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ - sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ - sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) -endif - -SBLASOBJS += \ - sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ - sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ - sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ - sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ - sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) - -DBLASOBJS += \ - dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ - dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ - dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ - dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ - dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) - -CBLASOBJS += \ - cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) - -ZBLASOBJS += \ - zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) - -endif - -###### BLAS extensions ##### - -ifeq ($(BUILD_SINGLE),1) -SBLASOBJS += \ - somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ - somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ - simatcopy_k_cn$(TSUFFIX).$(SUFFIX) simatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ - simatcopy_k_ct$(TSUFFIX).$(SUFFIX) simatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ - sgeadd_k$(TSUFFIX).$(SUFFIX) -endif -ifeq ($(BUILD_DOUBLE),1) -DBLASOBJS += \ - domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ - domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ - dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ - dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ - dgeadd_k$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(BUILD_COMPLEX),1) -CBLASOBJS += \ - comatcopy_k_cn$(TSUFFIX).$(SUFFIX) comatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ - comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ - comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ - comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ - cimatcopy_k_cn$(TSUFFIX).$(SUFFIX) cimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ - cimatcopy_k_ct$(TSUFFIX).$(SUFFIX) cimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ - cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ - cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ - cgeadd_k$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(BUILD_COMPLEX16),1) -ZBLASOBJS += \ - zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ - zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ - zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ - zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ - zimatcopy_k_cn$(TSUFFIX).$(SUFFIX) zimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ - zimatcopy_k_ct$(TSUFFIX).$(SUFFIX) zimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ - zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ - zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ - zgeadd_k$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(BUILD_BFLOAT16), 1) -SBGEMMINCOPYOBJ_P = $(SBGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -SBGEMMITCOPYOBJ_P = $(SBGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -SBGEMMONCOPYOBJ_P = $(SBGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -SBGEMMOTCOPYOBJ_P = $(SBGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -endif - -SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -SGEMMOTCOPYOBJ_P = $(SGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -DGEMMINCOPYOBJ_P = $(DGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -DGEMMITCOPYOBJ_P = $(DGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -DGEMMONCOPYOBJ_P = $(DGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -DGEMMOTCOPYOBJ_P = $(DGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -QGEMMINCOPYOBJ_P = $(QGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -QGEMMITCOPYOBJ_P = $(QGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -QGEMMONCOPYOBJ_P = $(QGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -QGEMMOTCOPYOBJ_P = $(QGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -CGEMMINCOPYOBJ_P = $(CGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -CGEMMITCOPYOBJ_P = $(CGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -CGEMMONCOPYOBJ_P = $(CGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -CGEMMOTCOPYOBJ_P = $(CGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -ZGEMMINCOPYOBJ_P = $(ZGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -ZGEMMITCOPYOBJ_P = $(ZGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -ZGEMMONCOPYOBJ_P = $(ZGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -ZGEMMOTCOPYOBJ_P = $(ZGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -XGEMMINCOPYOBJ_P = $(XGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) - -ifeq ($(BUILD_BFLOAT16),1) -$(KDIR)sbgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -endif - -$(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)dgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)qgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMM_BETA) - $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)cgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_BETA) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@ - -$(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@ - -$(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ - -ifeq ($(ARCH), E2K) -USE_TRMM = 1 -endif - - -ifeq ($(BUILD_BFLOAT16), 1) - -$(KDIR)$(SBGEMMONCOPYOBJ) : $(KERNELDIR)/$(SBGEMMONCOPY) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(SBGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SBGEMMOTCOPY) - -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmotcopy.s - m4 sbgemmotcopy.s > sbgemmotcopy_nomacros.s - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmotcopy_nomacros.s -o $@ - rm sbgemmotcopy.s sbgemmotcopy_nomacros.s -else - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -endif - -ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) - -$(KDIR)$(SBGEMMINCOPYOBJ) : $(KERNELDIR)/$(SBGEMMINCOPY) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(SBGEMMITCOPYOBJ) : $(KERNELDIR)/$(SBGEMMITCOPY) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmitcopy.s - m4 sbgemmitcopy.s > sbgemmitcopy_nomacros.s - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmitcopy_nomacros.s -o $@ - rm sbgemmitcopy.s sbgemmitcopy_nomacros.s -else - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -endif - -endif -endif - -$(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s - m4 sgemmotcopy.s > sgemmotcopy_nomacros.s - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ - rm sgemmotcopy.s sgemmotcopy_nomacros.s -else - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -endif - - -ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) - -$(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s - m4 sgemmitcopy.s > sgemmitcopy_nomacros.s - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ - rm sgemmitcopy.s sgemmitcopy_nomacros.s -else - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -endif - -endif - -$(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_ncopy.s - m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@ - rm dgemm_ncopy.s dgemm_ncopy_nomacros.s -else - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -endif - -$(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) - -$(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_itcopy.s - m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@ - rm dgemm_itcopy.s dgemm_itcopy_nomacros.s -else - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -endif - -endif - -ifdef EXPRECISION - -$(KDIR)$(QGEMMONCOPYOBJ) : $(KERNELDIR)/$(QGEMMONCOPY) - $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(QGEMMOTCOPYOBJ) : $(KERNELDIR)/$(QGEMMOTCOPY) - $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(QGEMM_UNROLL_M), $(QGEMM_UNROLL_N)) - -$(KDIR)$(QGEMMINCOPYOBJ) : $(KERNELDIR)/$(QGEMMINCOPY) - $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(QGEMMITCOPYOBJ) : $(KERNELDIR)/$(QGEMMITCOPY) - $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -endif - -endif - -$(KDIR)$(CGEMMONCOPYOBJ) : $(KERNELDIR)/$(CGEMMONCOPY) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(CGEMMOTCOPYOBJ) : $(KERNELDIR)/$(CGEMMOTCOPY) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) - -$(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -S $< -o - > cgemm_itcopy.s - m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@ - rm cgemm_itcopy.s cgemm_itcopy_nomacros.s -else - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -endif - -endif - -$(KDIR)$(ZGEMMONCOPYOBJ) : $(KERNELDIR)/$(ZGEMMONCOPY) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(ZGEMMOTCOPYOBJ) : $(KERNELDIR)/$(ZGEMMOTCOPY) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) - -$(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > zgemm_itcopy.s - m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@ - rm zgemm_itcopy.s zgemm_itcopy_nomacros.s -else - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -endif - -endif - -ifdef EXPRECISION - -$(KDIR)$(XGEMMONCOPYOBJ) : $(KERNELDIR)/$(XGEMMONCOPY) - $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(XGEMMOTCOPYOBJ) : $(KERNELDIR)/$(XGEMMOTCOPY) - $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(XGEMM_UNROLL_M), $(XGEMM_UNROLL_N)) - -$(KDIR)$(XGEMMINCOPYOBJ) : $(KERNELDIR)/$(XGEMMINCOPY) - $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(XGEMMITCOPYOBJ) : $(KERNELDIR)/$(XGEMMITCOPY) - $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -endif - -endif - -$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemm_kernel$(TSUFFIX).s - m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@ - rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s -else - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -endif - -ifdef USE_DIRECT_SGEMM -$(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -endif - -ifeq ($(BUILD_BFLOAT16), 1) - -$(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemm_kernel$(TSUFFIX).s - m4 sbgemm_kernel$(TSUFFIX).s > sbgemm_kernel$(TSUFFIX)_nomacros.s - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemm_kernel$(TSUFFIX)_nomacros.s -o $@ - rm sbgemm_kernel$(TSUFFIX).s sbgemm_kernel$(TSUFFIX)_nomacros.s -else - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -endif -endif - -$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s - m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@ - rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s -else - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -endif - -$(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND) - $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNN $< -o - > cgemm_kernel_n.s - m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@ - rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s -else - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ -endif - -$(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCN $< -o - > cgemm_kernel_l.s - m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@ - rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s -else - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ -endif - -$(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s - m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ - rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s -else - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ -endif - -$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCC $< -o - > cgemm_kernel_b.s - m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@ - rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s -else - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ -endif - -$(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zgemm_kernel_n.s - m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ - rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s -else ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ -else - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ -endif - -$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCN $< -o - > zgemm_kernel_l.s - m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ - rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s -else ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ -else - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ -endif - -$(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNC $< -o - > zgemm_kernel_r.s - m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ - rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s -else ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ -else - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ -endif - -$(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCC $< -o - > zgemm_kernel_b.s - m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ - rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s -else ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ -else - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ -endif - -$(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)xgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $@ - -$(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $@ - -$(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ - - -ifdef USE_TRMM -$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > strmmkernel_ln.s - m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@ - rm strmmkernel_ln.s strmmkernel_ln_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ -endif - -$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > strmmkernel_lt.s - m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@ - rm strmmkernel_lt.s strmmkernel_lt_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ -endif - -$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > strmmkernel_rn.s - m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@ - rm strmmkernel_rn.s strmmkernel_rn_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ -endif - -$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s - m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ - rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > dtrmm_kernel_ln.s - m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@ - rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ -endif - -$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > dtrmm_kernel_lt.s - m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@ - rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ -endif - -$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > dtrmm_kernel_rn.s - m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@ - rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ -endif - -$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > dtrmm_kernel_rt.s - m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@ - rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ - -$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_ln.s - m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@ - rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ -endif - -$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_lt.s - m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@ - rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ -endif - -$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lr.s - m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@ - rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ -endif - -$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lc.s - m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@ - rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ -endif - -$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rn.s - m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@ - rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ -endif - -$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rt.s - m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@ - rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ -endif - -$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_rr.s - m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@ - rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ -endif - -$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_RC.s - m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@ - rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ -endif - -$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_ln.s - m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ - rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s -else ifeq ($(CORE), SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ -endif - -$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_lt.s - m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ - rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s -else ifeq ($(CORE), SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ -endif - -$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lr.s - m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ - rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s -else ifeq ($(CORE), SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ -endif - -$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lc.s - m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ - rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s -else ifeq ($(CORE), SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ -endif - -$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rn.s - m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ - rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s -else ifeq ($(CORE), SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ -endif - -$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rt.s - m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ - rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s -else ifeq ($(CORE), SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ -endif - -$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rr.s - m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ - rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s -else ifeq ($(CORE), SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ -endif - -$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rc.s - m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ - rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s -else ifeq ($(CORE), SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ -endif - -else -$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ - -$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s - m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ - rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ - -$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ - -$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) -ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ -endif - -$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) -ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ -endif -$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) -ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ -endif -$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) -ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ -endif -$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) -ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ -endif -$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) -ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ -endif -$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) -ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ -endif -$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) -ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ -endif -endif - - - - -$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)cgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)zgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM3MKERNEL) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)xgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM3MKERNEL) - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)strsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LN) $(STRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)strsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LT) $(STRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)strsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RN) $(STRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)strsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(STRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o - > dtrsm_kernel_lt.s - m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ - rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s -else - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ -endif - -$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RT) $(DTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LN) $(QTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LT) $(QTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RN) $(QTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RT) $(QTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -DCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -DCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -DCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -DCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -DCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -DCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -DCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -DCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -DCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -DCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -DCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@ - - -ifdef STRMMUNCOPY_M -$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef STRMMLNCOPY_M -$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -ifdef STRMMUTCOPY_M -$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef STRMMLTCOPY_M -$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -$(KDIR)strmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)strmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -ifdef DTRMMUNCOPY_M -$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef DTRMMLNCOPY_M -$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -ifdef DTRMMUTCOPY_M -$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef DTRMMLTCOPY_M -$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -$(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -ifdef CTRMMUNCOPY_M -$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef CTRMMLNCOPY_M -$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef CTRMMUTCOPY_M -$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef CTRMMLTCOPY_M -$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -$(KDIR)ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -ifdef ZTRMMUNCOPY_M -$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef ZTRMMLNCOPY_M -$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -ifdef ZTRMMUTCOPY_M -$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef ZTRMMLTCOPY_M -$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -$(KDIR)ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ssymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)ssymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ - -ifdef SSYMMUCOPY_M -$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMUCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ -else -$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ -endif - -ifdef SSYMMLCOPY_M -$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMLCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ -else -$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ -endif - -$(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ - -ifdef DSYMMUCOPY_M -$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMUCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ -else -$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ -endif - -ifdef DSYMMLCOPY_M -$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMLCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ -else -$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ -endif - -$(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)qsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ - -$(KDIR)qsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ - -$(KDIR)qsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ - -$(KDIR)csymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)csymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ - -ifdef CSYMMUCOPY_M -$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMUCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ -else -$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ -endif - -ifdef CSYMMLCOPY_M -$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMLCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ -else -$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ -endif - -$(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)zsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ - -ifdef ZSYMMUCOPY_M -$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMUCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ -else -$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ -endif - -ifdef ZSYMMLCOPY_M -$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMLCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ -else -$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ -endif - -$(KDIR)xsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)xsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ - -$(KDIR)xsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ - -$(KDIR)xsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ - -$(KDIR)chemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ - -$(KDIR)chemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ - -ifdef CHEMMUTCOPY_M -$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMUTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ -else -$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ -endif - -ifdef CHEMMLTCOPY_M -$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMLTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ -else -$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ -endif - -$(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ - -$(KDIR)zhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ - -ifdef ZHEMMUTCOPY_M -$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMUTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ -else -$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ -endif - -ifdef ZHEMMLTCOPY_M -$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMLTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ -else -$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ -endif - -$(KDIR)xhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ - -$(KDIR)xhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ - -$(KDIR)xhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ - -$(KDIR)xhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ - -$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -ifdef TRSMCOPYUN_M -$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef TRSMCOPYLN_M -$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -ifdef TRSMCOPYUT_M -$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef TRSMCOPYLT_M -$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -$(KDIR)strsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)strsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -ifdef TRSMCOPYUN_M -$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef TRSMCOPYLN_M -$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -ifdef TRSMCOPYUT_M -$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef TRSMCOPYLT_M -$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -$(KDIR)dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -ifdef ZTRSMCOPYUN_M -$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef ZTRSMCOPYLN_M -$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -ifdef ZTRSMCOPYUT_M -$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef ZTRSMCOPYLT_M -$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -$(KDIR)ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -ifdef ZTRSMCOPYUN_M -$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef ZTRSMCOPYLN_M -$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -ifdef ZTRSMCOPYUT_M -$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef ZTRSMCOPYLT_M -$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -$(KDIR)ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - - -$(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) - $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -ifeq ($(BUILD_BFLOAT16),1) -$(KDIR)sbgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA) - $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -endif - -$(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) - $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)qgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMM_BETA) - $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)cgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMM_BETA) - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@ - -$(KDIR)zgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@ - -$(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ - - -ifeq ($(BUILD_BFLOAT16), 1) -$(SBGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMONCOPY) - $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -$(SBGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMOTCOPY) - $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) -$(SBGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMINCOPY) - $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -$(SBGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMITCOPY) - $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -endif -endif - -$(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY) - $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(SGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMOTCOPY) - $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) - -$(SGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMINCOPY) - $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY) - $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -endif - -$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY) - $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(DGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMOTCOPY) - $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) - -$(DGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMINCOPY) - $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(DGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMITCOPY) - $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -endif - -ifdef EXPRECISION - -$(QGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMONCOPY) - $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(QGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMOTCOPY) - $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(QGEMM_UNROLL_M), $(QGEMM_UNROLL_N)) - -$(QGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMINCOPY) - $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(QGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMITCOPY) - $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -endif - -endif - -$(CGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMONCOPY) - $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(CGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMOTCOPY) - $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) - -$(CGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMINCOPY) - $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(CGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMITCOPY) - $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -endif - -$(ZGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMONCOPY) - $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(ZGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMOTCOPY) - $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) - -$(ZGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMINCOPY) - $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(ZGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMITCOPY) - $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -endif - -ifdef EXPRECISION - -$(XGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMONCOPY) - $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(XGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMOTCOPY) - $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(XGEMM_UNROLL_M), $(XGEMM_UNROLL_N)) - -$(XGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMINCOPY) - $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(XGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMITCOPY) - $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -endif - -endif - - -ifeq ($(BUILD_BFLOAT16), 1) -$(KDIR)sbgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND) - $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -endif - -$(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) - $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)dgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) - $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)qgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND) - $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)cgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ - -$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(PFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s - m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ - rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s -else - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ -endif - -$(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ - -$(KDIR)zgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)zgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ - -$(KDIR)zgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ - -$(KDIR)zgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ - -$(KDIR)xgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)xgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $@ - -$(KDIR)xgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $@ - -$(KDIR)xgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ - -$(KDIR)strmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)strmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ - -$(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s - m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ - rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ - -$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ - -$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)cgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL) - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)zgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM3MKERNEL) - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)xgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM3MKERNEL) - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)strsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LN) $(STRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)strsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LT) $(STRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)strsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RN) $(STRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)strsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(STRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)dtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RT) $(DTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)qtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LN) $(QTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)qtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LT) $(QTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)qtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RN) $(QTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)qtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RT) $(QTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -DCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -DCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -DCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -DCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -DCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -DCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -DCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -DCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -DCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -DCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -DCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@ - - -$(KDIR)strmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)strmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)strmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)strmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ssymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)ssymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ - -$(KDIR)ssymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ - -$(KDIR)ssymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ - -$(KDIR)dsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)dsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ - -$(KDIR)dsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ - -$(KDIR)dsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ - -$(KDIR)qsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)qsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ - -$(KDIR)qsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ - -$(KDIR)qsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ - -$(KDIR)csymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)csymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ - -$(KDIR)csymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ - -$(KDIR)csymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ - -$(KDIR)zsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)zsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ - -$(KDIR)zsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ - -$(KDIR)zsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ - -$(KDIR)xsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)xsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ - -$(KDIR)xsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ - -$(KDIR)xsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ - -$(KDIR)chemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ - -$(KDIR)chemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ - -$(KDIR)chemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ - -$(KDIR)chemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ - -$(KDIR)zhemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ - -$(KDIR)zhemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ - -$(KDIR)zhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ - -$(KDIR)zhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ - -$(KDIR)xhemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ - -$(KDIR)xhemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ - -$(KDIR)xhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ - -$(KDIR)xhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ - -$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)strsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)strsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)strsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)strsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - - -##### BLAS extensions ###### - -ifndef DOMATCOPY_CN -DOMATCOPY_CN = ../arm/omatcopy_cn.c -endif - -$(KDIR)domatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_CN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ - -ifndef DOMATCOPY_RN -DOMATCOPY_RN = ../arm/omatcopy_rn.c -endif - -$(KDIR)domatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_RN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ - -ifndef DOMATCOPY_CT -DOMATCOPY_CT = ../arm/omatcopy_ct.c -endif - -$(KDIR)domatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_CT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ - -ifndef DOMATCOPY_RT -DOMATCOPY_RT = ../arm/omatcopy_rt.c -endif - -$(KDIR)domatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_RT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ - -ifndef DIMATCOPY_CN -DIMATCOPY_CN = ../generic/imatcopy_cn.c -endif - -$(KDIR)dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_CN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ - -ifndef DIMATCOPY_RN -DIMATCOPY_RN = ../generic/imatcopy_rn.c -endif - -$(KDIR)dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_RN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ - -ifndef DIMATCOPY_CT -DIMATCOPY_CT = ../generic/imatcopy_ct.c -endif - -$(KDIR)dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_CT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ - -ifndef DIMATCOPY_RT -DIMATCOPY_RT = ../generic/imatcopy_rt.c -endif - -$(KDIR)dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_RT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ - -ifndef SOMATCOPY_CN -SOMATCOPY_CN = ../arm/omatcopy_cn.c -endif - -$(KDIR)somatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_CN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ - -ifndef SOMATCOPY_RN -SOMATCOPY_RN = ../arm/omatcopy_rn.c -endif - -$(KDIR)somatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_RN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ - -ifndef SOMATCOPY_CT -SOMATCOPY_CT = ../arm/omatcopy_ct.c -endif - -$(KDIR)somatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_CT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ - -ifndef SOMATCOPY_RT -SOMATCOPY_RT = ../arm/omatcopy_rt.c -endif - -$(KDIR)somatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_RT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ - -ifndef SIMATCOPY_CN -SIMATCOPY_CN = ../generic/imatcopy_cn.c -endif - -$(KDIR)simatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_CN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ - -ifndef SIMATCOPY_RN -SIMATCOPY_RN = ../generic/imatcopy_rn.c -endif - -$(KDIR)simatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_RN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ - -ifndef SIMATCOPY_CT -SIMATCOPY_CT = ../generic/imatcopy_ct.c -endif - -$(KDIR)simatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_CT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ - -ifndef SIMATCOPY_RT -SIMATCOPY_RT = ../generic/imatcopy_rt.c -endif - -$(KDIR)simatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_RT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ - - -ifndef COMATCOPY_CN -COMATCOPY_CN = ../arm/zomatcopy_cn.c -endif - -$(KDIR)comatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ - -ifndef COMATCOPY_RN -COMATCOPY_RN = ../arm/zomatcopy_rn.c -endif - -$(KDIR)comatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ - -ifndef COMATCOPY_CT -COMATCOPY_CT = ../arm/zomatcopy_ct.c -endif - -$(KDIR)comatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ - -ifndef COMATCOPY_RT -COMATCOPY_RT = ../arm/zomatcopy_rt.c -endif - -$(KDIR)comatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ - -ifndef COMATCOPY_CNC -COMATCOPY_CNC = ../arm/zomatcopy_cnc.c -endif - -$(KDIR)comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CNC) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ - -ifndef COMATCOPY_RNC -COMATCOPY_RNC = ../arm/zomatcopy_rnc.c -endif - -$(KDIR)comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RNC) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ - -ifndef COMATCOPY_CTC -COMATCOPY_CTC = ../arm/zomatcopy_ctc.c -endif - -$(KDIR)comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CTC) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ - -ifndef COMATCOPY_RTC -COMATCOPY_RTC = ../arm/zomatcopy_rtc.c -endif - -$(KDIR)comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RTC) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ - -ifndef CIMATCOPY_CN -CIMATCOPY_CN = ../generic/zimatcopy_cn.c -endif - -$(KDIR)cimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ - -ifndef CIMATCOPY_RN -CIMATCOPY_RN = ../generic/zimatcopy_rn.c -endif - -$(KDIR)cimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ - -ifndef CIMATCOPY_CT -CIMATCOPY_CT = ../generic/zimatcopy_ct.c -endif - -$(KDIR)cimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ - -ifndef CIMATCOPY_RT -CIMATCOPY_RT = ../generic/zimatcopy_rt.c -endif - -$(KDIR)cimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ - -ifndef CIMATCOPY_CNC -CIMATCOPY_CNC = ../generic/zimatcopy_cnc.c -endif - -$(KDIR)cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CNC) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ - -ifndef CIMATCOPY_RNC -CIMATCOPY_RNC = ../generic/zimatcopy_rnc.c -endif - -$(KDIR)cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RNC) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ - -ifndef CIMATCOPY_CTC -CIMATCOPY_CTC = ../generic/zimatcopy_ctc.c -endif - -$(KDIR)cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CTC) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ - -ifndef CIMATCOPY_RTC -CIMATCOPY_RTC = ../generic/zimatcopy_rtc.c -endif - -$(KDIR)cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RTC) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ - - - -ifndef ZOMATCOPY_CN -ZOMATCOPY_CN = ../arm/zomatcopy_cn.c -endif - -$(KDIR)zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ - -ifndef ZOMATCOPY_RN -ZOMATCOPY_RN = ../arm/zomatcopy_rn.c -endif - -$(KDIR)zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ - -ifndef ZOMATCOPY_CT -ZOMATCOPY_CT = ../arm/zomatcopy_ct.c -endif - -$(KDIR)zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ - -ifndef ZOMATCOPY_RT -ZOMATCOPY_RT = ../arm/zomatcopy_rt.c -endif - -$(KDIR)zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ - -ifndef ZOMATCOPY_CNC -ZOMATCOPY_CNC = ../arm/zomatcopy_cnc.c -endif - -$(KDIR)zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CNC) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ - -ifndef ZOMATCOPY_RNC -ZOMATCOPY_RNC = ../arm/zomatcopy_rnc.c -endif - -$(KDIR)zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RNC) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ - -ifndef ZOMATCOPY_CTC -ZOMATCOPY_CTC = ../arm/zomatcopy_ctc.c -endif - -$(KDIR)zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CTC) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ - -ifndef ZOMATCOPY_RTC -ZOMATCOPY_RTC = ../arm/zomatcopy_rtc.c -endif - -$(KDIR)zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RTC) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ - -ifndef ZIMATCOPY_CN -ZIMATCOPY_CN = ../generic/zimatcopy_cn.c -endif - -$(KDIR)zimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ - -ifndef ZIMATCOPY_RN -ZIMATCOPY_RN = ../generic/zimatcopy_rn.c -endif - -$(KDIR)zimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ - -ifndef ZIMATCOPY_CT -ZIMATCOPY_CT = ../generic/zimatcopy_ct.c -endif - -$(KDIR)zimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ - -ifndef ZIMATCOPY_RT -ZIMATCOPY_RT = ../generic/zimatcopy_rt.c -endif - -$(KDIR)zimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ - -ifndef ZIMATCOPY_CNC -ZIMATCOPY_CNC = ../generic/zimatcopy_cnc.c -endif - -$(KDIR)zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CNC) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ - -ifndef ZIMATCOPY_RNC -ZIMATCOPY_RNC = ../generic/zimatcopy_rnc.c -endif - -$(KDIR)zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RNC) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ - -ifndef ZIMATCOPY_CTC -ZIMATCOPY_CTC = ../generic/zimatcopy_ctc.c -endif - -$(KDIR)zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CTC) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ - -ifndef ZIMATCOPY_RTC -ZIMATCOPY_RTC = ../generic/zimatcopy_rtc.c -endif - -$(KDIR)zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RTC) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ - - -ifndef SGEADD_K -SGEADD_K = ../generic/geadd.c -endif - -$(KDIR)sgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEADD_K) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ - -ifndef DGEADD_K -DGEADD_K = ../generic/geadd.c -endif - -$(KDIR)dgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEADD_K) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ - -ifndef CGEADD_K -CGEADD_K = ../generic/zgeadd.c -endif - -$(KDIR)cgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEADD_K) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM $< -o $@ - -ifndef ZGEADD_K -ZGEADD_K = ../generic/zgeadd.c -endif - -$(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@ - - - -###### BLAS small matrix optimization ##### - -ifndef DGEMM_SMALL_M_PERMIT -DGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c -endif - -ifndef DGEMM_SMALL_K_NN -DGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c -endif - -ifndef DGEMM_SMALL_K_NT -DGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c -endif - -ifndef DGEMM_SMALL_K_TN -DGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c -endif - -ifndef DGEMM_SMALL_K_TT -DGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c -endif - -$(KDIR)dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_M_PERMIT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -ifndef DGEMM_SMALL_K_B0_NN -DGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c -endif - -ifndef DGEMM_SMALL_K_B0_NT -DGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c -endif - -ifndef DGEMM_SMALL_K_B0_TN -DGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c -endif - -ifndef DGEMM_SMALL_K_B0_TT -DGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c -endif - -$(KDIR)dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ - -$(KDIR)dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ - -$(KDIR)dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ - -$(KDIR)dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ - -ifndef SGEMM_SMALL_M_PERMIT -SGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c -endif - -ifndef SGEMM_SMALL_K_NN -SGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c -endif - -ifndef SGEMM_SMALL_K_NT -SGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c -endif - -ifndef SGEMM_SMALL_K_TN -SGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c -endif - -ifndef SGEMM_SMALL_K_TT -SGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c -endif - -$(KDIR)sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_M_PERMIT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -ifndef SGEMM_SMALL_K_B0_NN -SGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c -endif - -ifndef SGEMM_SMALL_K_B0_NT -SGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c -endif - -ifndef SGEMM_SMALL_K_B0_TN -SGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c -endif - -ifndef SGEMM_SMALL_K_B0_TT -SGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c -endif - -$(KDIR)sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ - -$(KDIR)sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ - -$(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ - -$(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ - - -ifeq ($(BUILD_BFLOAT16), 1) -ifndef SBGEMM_SMALL_M_PERMIT -SBGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c -endif - -ifndef SBGEMM_SMALL_K_NN -SBGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c -endif - -ifndef SBGEMM_SMALL_K_NT -SBGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c -endif - -ifndef SBGEMM_SMALL_K_TN -SBGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c -endif - -ifndef SBGEMM_SMALL_K_TT -SBGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c -endif - -$(KDIR)sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_M_PERMIT) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -ifndef SBGEMM_SMALL_K_B0_NN -SBGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c -endif - -ifndef SBGEMM_SMALL_K_B0_NT -SBGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c -endif - -ifndef SBGEMM_SMALL_K_B0_TN -SBGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c -endif - -ifndef SBGEMM_SMALL_K_B0_TT -SBGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c -endif - -$(KDIR)sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ - -$(KDIR)sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ - -$(KDIR)sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ - -$(KDIR)sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ -endif - -ifndef CGEMM_SMALL_M_PERMIT -CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c -endif - -ifndef CGEMM_SMALL_K_NN -CGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c -endif - -ifndef CGEMM_SMALL_K_NT -CGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c -endif - -ifndef CGEMM_SMALL_K_TN -CGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c -endif - -ifndef CGEMM_SMALL_K_TT -CGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c -endif - -$(KDIR)cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_M_PERMIT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@ - -$(KDIR)cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $@ - -$(KDIR)cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $@ - -$(KDIR)cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $@ - -$(KDIR)cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $@ - -$(KDIR)cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ - -$(KDIR)cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@ - -$(KDIR)cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $@ - -$(KDIR)cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@ - -$(KDIR)cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $@ - -$(KDIR)cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ - -$(KDIR)cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $@ - -$(KDIR)cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@ - -$(KDIR)cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $@ - -$(KDIR)cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $@ - -$(KDIR)cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ - -ifndef CGEMM_SMALL_K_B0_NN -CGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c -endif - -ifndef CGEMM_SMALL_K_B0_NT -CGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c -endif - -ifndef CGEMM_SMALL_K_B0_TN -CGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c -endif - -ifndef CGEMM_SMALL_K_B0_TT -CGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c -endif - -$(KDIR)cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@ - -ifndef ZGEMM_SMALL_M_PERMIT -ZGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c -endif - -ifndef ZGEMM_SMALL_K_NN -ZGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c -endif - -ifndef ZGEMM_SMALL_K_NT -ZGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c -endif - -ifndef ZGEMM_SMALL_K_TN -ZGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c -endif - -ifndef ZGEMM_SMALL_K_TT -ZGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c -endif - -$(KDIR)zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_M_PERMIT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@ - - -$(KDIR)zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $@ - -$(KDIR)zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $@ - -$(KDIR)zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $@ - -$(KDIR)zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $@ - -$(KDIR)zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ - -$(KDIR)zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@ - -$(KDIR)zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $@ - -$(KDIR)zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@ - -$(KDIR)zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $@ - -$(KDIR)zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ - -$(KDIR)zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $@ - -$(KDIR)zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@ - -$(KDIR)zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $@ - -$(KDIR)zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $@ - -$(KDIR)zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ - -ifndef ZGEMM_SMALL_K_B0_NN -ZGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c -endif - -ifndef ZGEMM_SMALL_K_B0_NT -ZGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c -endif - -ifndef ZGEMM_SMALL_K_B0_TN -ZGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c -endif - -ifndef ZGEMM_SMALL_K_B0_TT -ZGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c -endif - -$(KDIR)zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@ diff --git a/Makefile.arm64 b/Makefile.arm64 index 064e84cbb..ca053b03d 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -69,7 +69,7 @@ endif # in GCC>=9 ifeq ($(CORE), NEOVERSEN1) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) -ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG))) CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 @@ -92,26 +92,37 @@ endif # in GCC>=10.4 ifeq ($(CORE), NEOVERSEV1) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) -ifeq ($(GCCVERSIONGTEQ10), 1) -ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11))) -CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-v1 +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG))) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG))) +CCOMMON_OPT += -march=armv8.4-a+sve +ifeq (1, $(ISCLANG)) +CCOMMON_OPT += -mtune=cortex-x1 +else +CCOMMON_OPT += -mtune=neoverse-v1 +endif ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 endif else -CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native +CCOMMON_OPT += -march=armv8.4-a+sve +ifneq ($(CROSS), 1) +CCOMMON_OPT += -mtune=native +endif ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=armv8.4-a -mtune=native +FCOMMON_OPT += -march=armv8.4-a +ifneq ($(CROSS), 1) +FCOMMON_OPT += -mtune=native +endif endif endif else -CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 endif endif else -CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 endif @@ -122,30 +133,36 @@ endif # in GCC>=10.4 ifeq ($(CORE), NEOVERSEN2) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) -ifeq ($(GCCVERSIONGTEQ10), 1) -ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11))) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG))) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG))) ifneq ($(OSNAME), Darwin) CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 else -CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72 endif ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 endif else -CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native +CCOMMON_OPT += -march=armv8.5-a+sve +ifneq ($(CROSS), 1) +CCOMMON_OPT += -mtune=native +endif ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=armv8.5-a -mtune=native +FCOMMON_OPT += -march=armv8.5-a +ifneq ($(CROSS), 1) +FCOMMON_OPT += -mtune=native +endif endif endif else -CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 endif endif else -CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 endif @@ -155,7 +172,7 @@ endif # Use a53 tunings because a55 is only available in GCC>=8.1 ifeq ($(CORE), CORTEXA55) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) -ifeq ($(GCCVERSIONGTEQ8), 1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ8) $(ISCLANG))) CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 @@ -196,8 +213,13 @@ endif endif ifeq ($(CORE), THUNDERX3T110) -ifeq ($(GCCVERSIONGTEQ10), 1) -CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG))) +CCOMMON_OPT += -march=armv8.3-a +ifeq (0, $(ISCLANG)) +CCOMMON_OPT += -mtune=thunderx3t110 +else +CCOMMON_OPT += -mtune=thunderx2t99 +endif ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 endif @@ -225,9 +247,12 @@ endif endif endif -ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG))) ifeq ($(CORE), EMAG8180) -CCOMMON_OPT += -march=armv8-a -mtune=emag +CCOMMON_OPT += -march=armv8-a +ifeq ($(ISCLANG), 0) +CCOMMON_OPT += -mtune=emag +endif ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=emag endif diff --git a/Makefile.csky b/Makefile.csky new file mode 100644 index 000000000..36162af2a --- /dev/null +++ b/Makefile.csky @@ -0,0 +1,4 @@ +ifeq ($(CORE), CK860FV) +CCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float +FCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float -static +endif diff --git a/Makefile.install b/Makefile.install index 01899b970..b2bc5aa41 100644 --- a/Makefile.install +++ b/Makefile.install @@ -2,6 +2,18 @@ TOPDIR = . export GOTOBLAS_MAKEFILE = 1 -include $(TOPDIR)/Makefile.conf_last include ./Makefile.system +LNCMD = ln -fs + +ifdef THELIBNAME +LIBNAME=$(THELIBNAME) +LIBSONAME=$(THELIBSONAME) +endif +ifeq ($(FIXED_LIBNAME), 1) +LNCMD = true +endif +ifeq ($(INTERFACE64),1) +USE_64BITINT=1 +endif PREFIX ?= /opt/OpenBLAS @@ -91,7 +103,7 @@ ifneq ($(NO_STATIC),1) @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) + $(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif #for install shared library ifneq ($(NO_SHARED),1) @@ -99,21 +111,21 @@ ifneq ($(NO_SHARED),1) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) @install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ - ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) + $(LNCMD) $(LIBSONAME) $(LIBPREFIX).so ; \ + $(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD)) @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBSONAME) $(LIBPREFIX).so + $(LNCMD) $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), Darwin) @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-install_name_tool -id "$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(MAJOR_VERSION).dylib" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \ - ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib + $(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).dylib ; \ + $(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib endif ifeq ($(OSNAME), WINNT) @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" @@ -141,15 +153,15 @@ ifneq ($(NO_STATIC),1) @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) + $(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif #for install shared library ifneq ($(NO_SHARED),1) @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ - ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) + $(LNCMD) $(LIBSONAME) $(LIBPREFIX).so ; \ + $(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif endif @@ -162,6 +174,8 @@ endif @echo Generating $(LIBSONAMEBASE)$(SUFFIX64).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(PKGFILE)" + @echo 'libprefix='$(LIBNAMEPREFIX) >> "$(PKGFILE)" + @echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)" @echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)" @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)" @@ -178,7 +192,7 @@ endif ifneq ($(NO_SHARED),1) #ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(SYMBOLSUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" diff --git a/Makefile.power b/Makefile.power index 28a0bae08..3fa6d6faf 100644 --- a/Makefile.power +++ b/Makefile.power @@ -11,11 +11,23 @@ endif ifeq ($(CORE), POWER10) ifneq ($(C_COMPILER), PGI) +ifeq ($(C_COMPILER), GCC) +ifeq ($(GCCVERSIONGTEQ10), 1) +CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math +else ifneq ($(GCCVERSIONGT4), 1) +$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) +CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math +else +$(warning your compiler is too old to fully support POWER10, getting a newer version of gcc is recommended) +CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math +endif +else CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math +endif ifeq ($(F_COMPILER), IBM) -FCOMMON_OPT += -O2 -qrecur -qnosave +FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr10 -qtune=pwr10 -qfloat=nomaf -qzerosize else -FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math endif endif endif @@ -38,19 +50,18 @@ CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) ifeq ($(F_COMPILER), IBM) -FCOMMON_OPT += -O2 -qrecur -qnosave +FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr9 -qtune=pwr9 -qfloat=nomaf -qzerosize else -FCOMMON_OPT += -O2 -frecursive -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -fno-fast-math -mcpu=power9 -mtune=power9 endif -ifeq ($(C_COMPILER), GCC) + +ifeq ($(F_COMPILER), GFORTRAN) ifneq ($(GCCVERSIONGT4), 1) $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) FCOMMON_OPT += -mcpu=power8 -mtune=power8 else FCOMMON_OPT += -mcpu=power9 -mtune=power9 endif -else -FCOMMON_OPT += -mcpu=power9 -mtune=power9 endif else FCOMMON_OPT += -O2 -Mrecursive @@ -66,12 +77,16 @@ endif ifneq ($(F_COMPILER), PGI) ifeq ($(OSNAME), AIX) ifeq ($(F_COMPILER), IBM) -FCOMMON_OPT += -O2 -qrecur -qnosave +FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr8 -qtune=pwr8 -qfloat=nomaf -qzerosize else -FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math +FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math endif else -FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math +ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr8 -qtune=pwr8 -qfloat=nomaf -qzerosize +else +FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math +endif endif else FCOMMON_OPT += -O2 -Mrecursive @@ -84,13 +99,20 @@ CCOMMON_OPT += -DUSE_OPENMP -fopenmp else CCOMMON_OPT += -DUSE_OPENMP -mp endif +ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -DUSE_OPENMP +else ifneq ($(F_COMPILER), PGI) FCOMMON_OPT += -DUSE_OPENMP -fopenmp else FCOMMON_OPT += -DUSE_OPENMP -mp endif endif +endif +ifeq ($(C_COMPILER), CLANG) +CCOMMON_OPT += -fno-integrated-as +endif # workaround for C->FORTRAN ABI violation in LAPACKE ifeq ($(F_COMPILER), GFORTRAN) FCOMMON_OPT += -fno-optimize-sibling-calls @@ -125,8 +147,19 @@ endif ifdef BINARY64 +ifeq ($(C_COMPILER)$(F_COMPILER)$(OSNAME), GCCIBMAIX) +$(error Using GCC and XLF on AIX is not a supported combination.) +endif +ifeq ($(C_COMPILER)$(F_COMPILER)$(OSNAME), CLANGGFORTRANAIX) +$(error Using Clang and gFortran on AIX is not a supported combination.) +endif + ifeq ($(OSNAME), AIX) +ifeq ($(C_COMPILER), GCC) CCOMMON_OPT += -mpowerpc64 -maix64 +else +CCOMMON_OPT += -m64 +endif ifeq ($(COMPILER_F77), g77) FCOMMON_OPT += -mpowerpc64 -maix64 endif diff --git a/Makefile.prebuild b/Makefile.prebuild index 0be4f1274..b7d695a75 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -55,6 +55,26 @@ ifeq ($(TARGET), C910V) TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d endif +ifeq ($(TARGET), CK860FV) +TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float +endif + +ifeq ($(TARGET), x280) +TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d +endif + +ifeq ($(TARGET), RISCV64_ZVL256B) +TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d +endif + +ifeq ($(TARGET), RISCV64_ZVL128B) +TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d +endif + +ifeq ($(TARGET), RISCV64_GENERIC) +TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d +endif + all: getarch_2nd ./getarch_2nd 0 >> $(TARGET_MAKE) ./getarch_2nd 1 >> $(TARGET_CONF) diff --git a/Makefile.riscv64 b/Makefile.riscv64 index ce91e03ec..113cc57c5 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -2,3 +2,19 @@ ifeq ($(CORE), C910V) CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static endif +ifeq ($(CORE), x280) +CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math +FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static +endif +ifeq ($(CORE), RISCV64_ZVL256B) +CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d +FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static +endif +ifeq ($(CORE), RISCV64_ZVL128B) +CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d +FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static +endif +ifeq ($(CORE), RISCV64_GENERIC) +CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d +FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static +endif diff --git a/Makefile.rule b/Makefile.rule index ab46fd075..02ec739fd 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,12 @@ # # This library's version -VERSION = 0.3.23 +VERSION = 0.3.26.dev + +# If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a +# and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library +# +# LIBNAMEPREFIX = scipy # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library diff --git a/Makefile.system b/Makefile.system index 343b94bb3..aadf3459a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -277,10 +277,6 @@ endif ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 -# Determine if the assembler is GNU Assembler -HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?) -GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS) - # Generating Makefile.conf and config.h DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) @@ -384,6 +380,11 @@ GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) endif +ifeq ($(C_COMPILER), CLANG) +CLANGVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) +CLANGVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12) +endif + # # OS dependent settings # @@ -392,11 +393,22 @@ ifeq ($(OSNAME), Darwin) ifndef MACOSX_DEPLOYMENT_TARGET ifeq ($(ARCH), arm64) export MACOSX_DEPLOYMENT_TARGET=11.0 +ifeq ($(C_COMPILER), GCC) +export NO_SVE = 1 +endif else export MACOSX_DEPLOYMENT_TARGET=10.8 endif endif MD5SUM = md5 -r +XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.Xcode |awk '/version:/ {print $2}'|cut -d: -f2|cut -f1 -d.) +ifeq (x$(XCVER)x,xx) +XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables |awk '/version:/ {print $2}'|cut -d: -f2|cut -f1 -d.) +endif +ifeq (x$(XCVER), x 15) +CCOMMON_OPT += -Wl,-ld_classic +FCOMMON_OPT += -Wl,-ld_classic +endif endif ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly)) @@ -597,6 +609,9 @@ endif ifeq ($(C_COMPILER), CLANG) CCOMMON_OPT += -fopenmp +ifeq ($(F_COMPILER), GFORTRAN) +FEXTRALIB := $(subst -lgomp,-lomp,$(FEXTRALIB)) +endif endif ifeq ($(C_COMPILER), INTEL) @@ -645,7 +660,7 @@ DYNAMIC_CORE += HASWELL ZEN endif ifneq ($(NO_AVX512), 1) ifneq ($(NO_AVX2), 1) -DYNAMIC_CORE += SKYLAKEX COOPERLAKE +DYNAMIC_CORE += SKYLAKEX COOPERLAKE SAPPHIRERAPIDS endif endif endif @@ -662,15 +677,12 @@ ifeq ($(ARCH), arm64) DYNAMIC_CORE = ARMV8 DYNAMIC_CORE += CORTEXA53 DYNAMIC_CORE += CORTEXA57 -DYNAMIC_CORE += CORTEXA72 -DYNAMIC_CORE += CORTEXA73 DYNAMIC_CORE += NEOVERSEN1 ifneq ($(NO_SVE), 1) DYNAMIC_CORE += NEOVERSEV1 DYNAMIC_CORE += NEOVERSEN2 +DYNAMIC_CORE += ARMV8SVE endif -DYNAMIC_CORE += CORTEXA55 -DYNAMIC_CORE += FALKOR DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += TSV110 @@ -744,7 +756,11 @@ DYNAMIC_CORE += POWER9 else $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif +ifeq ($(OSNAME), AIX) +LDVERSIONGTEQ35 := 1 +else LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35) +endif ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11) DYNAMIC_CORE += POWER10 CCOMMON_OPT += -DHAVE_P10_SUPPORT @@ -857,6 +873,11 @@ endif endif endif +ifeq ($(ARCH), csky) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif + # # C Compiler dependent settings # @@ -932,8 +953,12 @@ BINARY_DEFINED = 1 endif ifeq ($(ARCH), loongarch64) -CCOMMON_OPT += -march=loongarch64 -mabi=lp64 -FCOMMON_OPT += -march=loongarch64 -mabi=lp64 +LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d) +ifneq ($(LA64_ABI), lp64d) +LA64_ABI=lp64 +endif +CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) +FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) endif endif @@ -1082,8 +1107,9 @@ endif endif endif -ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW)) CCOMMON_OPT += -DF_INTERFACE_GFORT +ifeq ($(F_COMPILER), GFORTRAN) FCOMMON_OPT += -Wall # make single-threaded LAPACK calls thread-safe #1847 FCOMMON_OPT += -frecursive @@ -1097,6 +1123,7 @@ EXTRALIB += -lgfortran endif endif endif +endif ifdef NO_BINARY_MODE ifeq ($(ARCH), $(filter $(ARCH),mips64)) ifdef BINARY64 @@ -1152,6 +1179,10 @@ endif ifeq ($(F_COMPILER), IBM) CCOMMON_OPT += -DF_INTERFACE_IBM +FEXTRALIB += -lxlf90 +ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG)) +FCOMMON_OPT += -qextname +endif # FCOMMON_OPT += -qarch=440 ifdef BINARY64 FCOMMON_OPT += -q64 @@ -1348,6 +1379,8 @@ ifeq ($(F_COMPILER), SUN) FCOMMON_OPT += -pic else ifeq ($(F_COMPILER), NAG) FCOMMON_OPT += -PIC +else ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -qpic=large else FCOMMON_OPT += -fPIC endif @@ -1483,16 +1516,28 @@ ifndef LIBSONAMEBASE LIBSONAMEBASE = openblas endif +ifndef LIBNAMEPREFIX +LIBNAMEPREFIX = +endif + +SYMPREFIX=$(SYMBOLPREFIX) +ifeq ($(SYMBOLPREFIX),$(LIBNAMEPREFIX)) +SYMPREFIX= +endif +SYMSUFFIX=$(SYMBOLSUFFIX) +ifeq ($(SYMBOLSUFFIX),$(LIBNAMESUFFIX)) +SYMSUFFIX= +endif ifndef LIBNAMESUFFIX -LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX) +LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX) else -LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX) +LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)$(LIBNAMESUFFIX) endif ifeq ($(OSNAME), CYGWIN_NT) -LIBPREFIX = cyg$(LIBNAMEBASE) +LIBPREFIX = cyg$(LIBNAMEPREFIX)$(LIBNAMEBASE) else -LIBPREFIX = lib$(LIBNAMEBASE) +LIBPREFIX = lib$(LIBNAMEPREFIX)$(LIBNAMEBASE) endif KERNELDIR = $(TOPDIR)/kernel/$(ARCH) @@ -1600,9 +1645,11 @@ override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) ifeq ($(NEED_PIC), 1) ifeq (,$(findstring PIC,$(FFLAGS))) +ifneq ($(F_COMPILER),IBM) override FFLAGS += -fPIC endif endif +endif #For LAPACK Fortran codes. #Disable -fopenmp for LAPACK Fortran codes on Windows. @@ -1616,11 +1663,11 @@ endif ifeq ($(F_COMPILER),NAG) LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) -FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) +override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) endif ifeq ($(F_COMPILER),CRAY) LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) -FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) +override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) endif LAPACK_CFLAGS = $(CFLAGS) @@ -1669,14 +1716,14 @@ LIBNAME_P = $(LIBPREFIX)p$(REVISION)_p.$(LIBSUFFIX) endif endif +ifeq ($(FIXED_LIBNAME),1) + LIBNAME = lib$(LIBNAMEPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).$(LIBSUFFIX) + LIBNAME_P = lib$(LIBNAMEPREFIX)$(LISOBNAMEBASE)$(LIBNAMESUFFIX)_p.$(LIBSUFFIX) +endif LIBDLLNAME = $(LIBPREFIX).dll IMPLIBNAME = lib$(LIBNAMEBASE).dll.a -ifneq ($(OSNAME), AIX) LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) -else -LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a) -endif LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) @@ -1763,6 +1810,8 @@ export TARGET_CORE export NO_AVX512 export NO_AVX2 export BUILD_BFLOAT16 +export NO_LSX +export NO_LASX export SBGEMM_UNROLL_M export SBGEMM_UNROLL_N diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 7ab331b1f..c0dbe84c8 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -75,18 +75,31 @@ endif ifeq ($(CORE), COOPERLAKE) ifndef NO_AVX512 ifeq ($(C_COMPILER), GCC) -# cooperlake support was added in 10.1 -ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) -CCOMMON_OPT += -march=cooperlake -ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=cooperlake -endif -else # gcc not support, fallback to avx512 -CCOMMON_OPT += -march=skylake-avx512 -ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=skylake-avx512 -endif -endif + # cooperlake support was added in 10.1 + ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) + CCOMMON_OPT += -march=cooperlake + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=cooperlake + endif + else # gcc not support, fallback to avx512 + CCOMMON_OPT += -march=skylake-avx512 + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=skylake-avx512 + endif + endif +else ifeq ($(C_COMPILER), CLANG) + # cooperlake support was added in clang 9 + ifeq ($(CLANGVERSIONGTEQ9), 1) + CCOMMON_OPT += -march=cooperlake + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=cooperlake + endif + else # not supported in clang, fallback to avx512 + CCOMMON_OPT += -march=skylake-avx512 + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=skylake-avx512 + endif + endif endif ifeq ($(OSNAME), CYGWIN_NT) CCOMMON_OPT += -fno-asynchronous-unwind-tables @@ -104,18 +117,31 @@ endif ifeq ($(CORE), SAPPHIRERAPIDS) ifndef NO_AVX512 ifeq ($(C_COMPILER), GCC) -# sapphire rapids support was added in 11 -ifeq ($(GCCVERSIONGTEQ11), 1) -CCOMMON_OPT += -march=sapphirerapids -ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=sapphirerapids -endif -else # gcc not support, fallback to avx512 -CCOMMON_OPT += -march=skylake-avx512 -ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=skylake-avx512 -endif -endif + # sapphire rapids support was added in 11 + ifeq ($(GCCVERSIONGTEQ11), 1) + CCOMMON_OPT += -march=sapphirerapids + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=sapphirerapids + endif + else # gcc not support, fallback to avx512 + CCOMMON_OPT += -march=skylake-avx512 + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=skylake-avx512 + endif + endif +else ifeq ($(C_COMPILER), CLANG) + # sapphire rapids support was added in clang 12 + ifeq ($(CLANGVERSIONGTEQ12), 1) + CCOMMON_OPT += -march=sapphirerapids + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=sapphirerapids + endif + else # not supported in clang, fallback to avx512 + CCOMMON_OPT += -march=skylake-avx512 + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=skylake-avx512 + endif + endif endif ifeq ($(OSNAME), CYGWIN_NT) CCOMMON_OPT += -fno-asynchronous-unwind-tables diff --git a/README.md b/README.md index 6ce85e08e..43f390db0 100644 --- a/README.md +++ b/README.md @@ -6,11 +6,15 @@ Travis CI: [![Build Status](https://travis-ci.com/xianyi/OpenBLAS.svg?branch=dev AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) -Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/status.svg?branch=develop)](https://cloud.drone.io/xianyi/OpenBLAS/) +Cirrus CI: [![Build Status](https://api.cirrus-ci.com/github/xianyi/OpenBLAS.svg?branch=develop)](https://cirrus-ci.com/github/xianyi/OpenBLAS) + + [![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop) +OSUOSL POWERCI [![Build Status](https://powerci.osuosl.org/buildStatus/icon?job=OpenBLAS_gh%2Fdevelop)](http://powerci.osuosl.org/job/OpenBLAS_gh/job/develop/) +OSUOSL IBMZ-CI [![Build Status](http://ibmz-ci.osuosl.org/buildStatus/icon?job=OpenBLAS-Z%2Fdevelop)](http://ibmz-ci.osuosl.org/job/OpenBLAS-Z/job/develop/) ## Introduction OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version. @@ -50,10 +54,15 @@ Building OpenBLAS requires the following to be installed: Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically. To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`. -The full target list is in the file `TargetList.txt`. For building with `cmake`, the -usual conventions apply, i.e. create a build directory either underneath the toplevel -OpenBLAS source directory or separate from it, and invoke `cmake` there with the path -to the source tree and any build options you plan to set. +The full target list is in the file `TargetList.txt`, other build optionss are documented in Makefile.rule and +can either be set there (typically by removing the comment character from the respective line), or used on the +`make` command line. +Note that when you run `make install` after building, you need to repeat all command line options you provided to `make` +in the build step, as some settings like the supported maximum number of threads are automatically derived from the +build host by default, which might not be what you want. +For building with `cmake`, the usual conventions apply, i.e. create a build directory either underneath the toplevel +OpenBLAS source directory or separate from it, and invoke `cmake` there with the path to the source tree and any +build options you plan to set. ### Cross compile @@ -113,7 +122,7 @@ Use `PREFIX=` when invoking `make`, for example ```sh make install PREFIX=your_installation_directory ``` - +(along with all options you added on the `make` command line in the preceding build step) The default installation directory is `/opt/OpenBLAS`. ## Supported CPUs and Operating Systems @@ -133,7 +142,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. -- **AMD ZEN**: Uses Haswell codes with some optimizations. +- **AMD ZEN**: Uses Haswell codes with some optimizations for Zen 2/3 (use SkylakeX for Zen4) #### MIPS32 @@ -165,13 +174,16 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **TSV110**: Optimized some Level-3 helper functions - **EMAG 8180**: preliminary support based on A57 - **Neoverse N1**: (AWS Graviton2) preliminary support -- **Apple Vortex**: preliminary support based on ARMV8 +- **Neoverse V1**: (AWS Graviton3) optimized Level-3 BLAS +- **Apple Vortex**: preliminary support based on ThunderX2/3 +- **A64FX**: preliminary support, optimized Level-3 BLAS +- **ARMV8SVE**: any ARMV8 cpu with SVE extensions #### PPC/PPC64 - **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1` - **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only. -- **POWER10**: +- **POWER10**: Optimized Level-3 BLAS including SBGEMM and some Level-1,2. #### IBM zEnterprise System @@ -184,20 +196,37 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th ```sh make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran ``` - (also known to work on C906) + (also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision) + +- **x280**: Level-3 BLAS and Level-1,2 are optimized by RISC-V Vector extension 1.0. + ```sh + make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran + ``` + +- **ZVL???B**: Level-3 BLAS and Level-1,2 including vectorised kernels targeting generic RISCV cores with vector support with registers of at least the corresponding width; ZVL128B and ZVL256B are available. +e.g.: + ```sh +make TARGET=RISCV64_ZVL256B CFLAGS="-DTARGET=RISCV64_ZVL256B" \ + BINARY=64 ARCH=riscv64 CC='clang -target riscv64-unknown-linux-gnu' \ + AR=riscv64-unknown-linux-gnu-ar AS=riscv64-unknown-linux-gnu-gcc \ + LD=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran \ + HOSTCC=gcc HOSTFC=gfortran -j + ``` ### Support for multiple targets in a single library OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake. -For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default. +For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX, Cooper Lake, Sapphire Rapids. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default. `DYNAMIC_ARCH` is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias, Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano. -On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. +On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. If compiler support for SVE is available at build time, support for NeoverseN2, NeoverseV1 as well as generic ArmV8SVE targets is also enabled. + +For **POWER**, the list encompasses POWER6, POWER8 and POWER9. POWER10 is additionally available if a sufficiently recent compiler is used for the build. -For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14. +on **ZARCH** it comprises Z13 and Z14 as well as generic zarch support. The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the common code in the library, usually you will want to set this to the oldest model you expect to encounter. diff --git a/TargetList.txt b/TargetList.txt index deef75819..115030c1b 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -118,8 +118,11 @@ Z13 Z14 10.RISC-V 64: -RISCV64_GENERIC +RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54) +RISCV64_ZVL128B C910V +x280 +RISCV64_ZVL256B 11.LOONGARCH64: LOONGSONGENERIC @@ -133,3 +136,7 @@ E2K EV4 EV5 EV6 + +14.CSKY +CSKY +CK860FV diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 16b9da4f5..3ae8615a7 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -115,7 +115,7 @@ jobs: mkdir build cd build call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat" - cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. + cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER="flang -I C:\Miniconda\Library\include\flang" -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. cmake --build . --config Release ctest @@ -167,11 +167,10 @@ jobs: - job: OSX_OpenMP_Clang pool: - vmImage: 'macOS-11' + vmImage: 'macOS-latest' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib LIBRARY_PATH: /usr/local/opt/llvm/lib - MACOSX_DEPLOYMENT_TARGET: 11.0 steps: - script: | brew update @@ -180,7 +179,7 @@ jobs: - job: OSX_OpenMP_Clang_cmake pool: - vmImage: 'macOS-11' + vmImage: 'macOS-latest' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib LIBRARY_PATH: /usr/local/opt/llvm/lib @@ -210,7 +209,7 @@ jobs: - job: OSX_Ifort_Clang pool: - vmImage: 'macOS-11' + vmImage: 'macOS-latest' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg @@ -271,14 +270,27 @@ jobs: - script: | make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 +- job: OSX_xbuild_DYNAMIC_ARM64 + pool: + vmImage: 'macOS-11' + variables: + CC: /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX11.3.sdk -arch arm64 + steps: + - script: | + ls /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs + /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus + /Applications/Xcode_11.7.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version + make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 + - job: ALPINE_MUSL pool: vmImage: 'ubuntu-latest' steps: - script: | - wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \ - && echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc alpine-chroot-install' | sha1sum -c \ - || exit 1 + wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.14.0/alpine-chroot-install \ + && echo 'ccbf65f85cdc351851f8ad025bb3e65bae4d5b06 alpine-chroot-install' | sha1sum -c \ + || exit 1 alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 diff --git a/benchmark/Makefile b/benchmark/Makefile index d9ddb9042..b7493950a 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -37,6 +37,12 @@ ESSL=/opt/ibm/lib #LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a +# x280 temporary workaround for gfortran +ifeq ($(TARGET), x280) +CCOMMON_OPT:=$(filter-out -mllvm --riscv-v-vector-bits-min=512,$(CCOMMON_OPT)) +endif + + ifneq ($(NO_LAPACK), 1) GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ @@ -3436,4 +3442,4 @@ smallscaling: smallscaling.c ../$(LIBNAME) clean :: @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling -include $(TOPDIR)/Makefile.tail +include $(TOPDIR)/Makefile.tail \ No newline at end of file diff --git a/benchmark/spr.c b/benchmark/spr.c old mode 100755 new mode 100644 diff --git a/benchmark/spr2.c b/benchmark/spr2.c old mode 100755 new mode 100644 diff --git a/benchmark/syrk.c b/benchmark/syrk.c index fa0f24666..e0ae58707 100644 --- a/benchmark/syrk.c +++ b/benchmark/syrk.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project +Copyright (c) 2014, 2023 The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -67,7 +67,7 @@ int main(int argc, char *argv[]){ int step = 1; int loops = 1; - if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p); double time1,timeg; @@ -77,7 +77,7 @@ int main(int argc, char *argv[]){ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} - fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans); + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Loops = %d\n", from, to, step,uplo,trans,loops); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ diff --git a/benchmark/trsv.c b/benchmark/trsv.c index 66ac3a3c7..e17c57157 100644 --- a/benchmark/trsv.c +++ b/benchmark/trsv.c @@ -127,7 +127,7 @@ int main(int argc, char *argv[]){ long long muls = n*(n+1)/2.0; long long adds = (n - 1.0)*n/2.0; - fprintf(stderr, "%10d %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg); + fprintf(stderr, "%10d : %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg); if(a != NULL){ free(a); } diff --git a/c_check b/c_check index e8f90e18a..59ab9bb13 100755 --- a/c_check +++ b/c_check @@ -31,13 +31,17 @@ flags="$*" cross_suffix="" -if [ "`dirname \"$compiler_name\"`" != '.' ]; then - cross_suffix="$cross_suffix`dirname \"$compiler_name\"`/" +if [ "`dirname "$compiler_name"`" != '.' ]; then + cross_suffix="$cross_suffix`dirname "$compiler_name"`/" fi -bn=`basename $compiler_name` +cn=`echo $compiler_name | sed -e 's/ -.*//'` +bn=`basename "$cn"` + case "$bn" in - *-*) cross_suffix="$cross_suffix${bn%-*}-" + *-*) if [ "$bn" != '-' ]; then + cross_suffix="$cross_suffix${bn%-*}-" + fi esac compiler="" @@ -87,16 +91,25 @@ case "$data" in *ARCH_ZARCH*) architecture=zarch ;; *ARCH_RISCV64*) architecture=riscv64 ;; *ARCH_LOONGARCH64*) architecture=loongarch64 ;; + *ARCH_CSKY*) architecture=csky ;; esac defined=0 if [ "$os" = "AIX" ]; then - case "$BINARY" in - 32) compiler_name="$compiler_name -maix32" ;; - 64) compiler_name="$compiler_name -maix64" ;; - esac - defined=1 + if [ "$compiler" = "GCC" ]; then + case "$BINARY" in + 32) compiler_name="$compiler_name -maix32" ;; + 64) compiler_name="$compiler_name -maix64" ;; + esac + defined=1 + else + case "$BINARY" in + 32) compiler_name="$compiler_name -m32" ;; + 64) compiler_name="$compiler_name -m64" ;; + esac + defined=1 + fi fi case "$architecture" in @@ -164,7 +177,7 @@ fi no_msa=0 if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then - tmpd="$(mktemp -d)" + tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') tmpf="$tmpd/a.c" code='"addvi.b $w0, $w1, 1"' msa_flags='-mmsa -mfp64 -mload-store-pairs' @@ -181,6 +194,35 @@ if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then rm -rf "$tmpd" fi +no_lsx=0 +no_lasx=0 +if [ "$architecture" = "loongarch64" ]; then + tmpd="$(mktemp -d)" + tmplsx="$tmpd/lsx.c" + codelsx='"vadd.b $vr0, $vr0, $vr0"' + lsx_flags='-march=loongarch64' + printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx" + args="$lsx_flags -o $tmplsx.o $tmplsx" + { + $compiler_name $flags $args >/dev/null 2>&1 + } || { + no_lsx=1 + } + + tmplasx="$tmpd/lasx.c" + codelasx='"xvadd.b $xr0, $xr0, $xr0"' + lasx_flags='-march=loongarch64' + printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx" + args="$lasx_flags -o $tmplasx.o $tmplasx" + { + $compiler_name $flags $args >/dev/null 2>&1 + } || { + no_lasx=1 + } + + rm -rf "$tmpd" +fi + case "$data" in *ARCH_X86_64*) architecture=x86_64 ;; *ARCH_X86*) architecture=x86 ;; @@ -195,6 +237,7 @@ case "$data" in *ARCH_ARM*) architecture=arm ;; *ARCH_ZARCH*) architecture=zarch ;; *ARCH_LOONGARCH64*) architecture=loongarch64 ;; + *ARCH_CSKY*) architecture=csky ;; esac binformat='bin32' @@ -203,8 +246,9 @@ case "$data" in esac no_avx512=0 +no_avx512bf=0 if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then - tmpd=`mktemp -d` + tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') tmpf="$tmpd/a.c" code='"vbroadcastss -4 * 4(%rsi), %zmm2"' printf "#include \n\nint main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf" @@ -221,11 +265,30 @@ if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then } rm -rf "$tmpd" + if [ "$no_avx512" -eq 0 ]; then + tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') + tmpf="$tmpd/a.c" + code='"__m512 a= _mm512_dpbf16_ps(a, (__m512bh) _mm512_loadu_si512(%1]), (__m512bh) _mm512_loadu_si512(%2]));"' + printf "#include \n\nint main(void){ %s; }\n" "$code" >> "$tmpf" + if [ "$compiler" = "PGI" ]; then + args=" -tp cooperlake -c -o $tmpf.o $tmpf" + else + args=" -march=cooperlake -c -o $tmpf.o $tmpf" + fi + no_avx512bf=0 + { + $compiler_name $flags $args >/dev/null 2>&1 + } || { + no_avx512bf=1 + } + + rm -rf "$tmpd" + fi fi no_rv64gv=0 if [ "$architecture" = "riscv64" ]; then - tmpd=`mktemp -d` + tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') tmpf="$tmpd/a.c" code='"vsetvli zero, zero, e8, m1\n"' printf "int main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf" @@ -241,13 +304,16 @@ fi no_sve=0 if [ "$architecture" = "arm64" ]; then - tmpd=`mktemp -d` + tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') tmpf="$tmpd/a.c" printf "#include \n\n int main(void){}\n">> "$tmpf" args=" -march=armv8-a+sve -c -o $tmpf.o $tmpf" no_sve=0 { $compiler_name $flags $args >/dev/null 2>&1 + } || { + args=" -Msve_intrinsics -c -o $tmpf.o $tmpf" + $compiler_name $flags $args >/dev/null 2>&1 } || { no_sve=1 } @@ -257,7 +323,7 @@ fi c11_atomics=0 case "$data" in *HAVE_C11*) - tmpd=`mktemp -d` + tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') tmpf="$tmpd/a.c" printf "#include \nint main(void){}\n" >> "$tmpf" args=" -c -o $tmpf.o $tmpf" @@ -365,6 +431,7 @@ done [ "$makefile" = "-" ] && { [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" + [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" exit 0 @@ -393,8 +460,11 @@ done [ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n" [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" + [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" + [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n" + [ "$no_lasx" -eq 1 ] && printf "NO_LASX=1\n" } >> "$makefile" os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ ` @@ -410,6 +480,8 @@ compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' ` [ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu" [ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n" [ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n" + [ "$no_lsx" -eq 1 ] && printf "#define NO_LSX\t1\n" + [ "$no_lasx" -eq 1 ] && printf "#define NO_LASX\t1\n" } >> "$config" diff --git a/c_check.pl b/c_check.pl index 6ce28e11b..6b89f06eb 100644 --- a/c_check.pl +++ b/c_check.pl @@ -97,6 +97,7 @@ $architecture = arm64 if ($data =~ /ARCH_ARM64/); $architecture = zarch if ($data =~ /ARCH_ZARCH/); $architecture = riscv64 if ($data =~ /ARCH_RISCV64/); $architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); +$architecture = csky if ($data =~ /ARCH_CSKY/); $defined = 0; @@ -156,6 +157,11 @@ if ($architecture eq "loongarch64") { $binary = 64; } +if ($architecture eq "csky") { + $defined = 1; + $binary = 32; +} + if ($compiler eq "PGI") { $compiler_name .= " -tp p7" if ($binary eq "32"); $compiler_name .= " -tp p7-64" if ($binary eq "64"); @@ -232,6 +238,45 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { } } +$no_lsx = 0; +$no_lasx = 0; +if (($architecture eq "loongarch64")) { + eval "use File::Temp qw(tempfile)"; + if ($@){ + warn "could not load PERL module File::Temp, so could not check LSX and LASX capatibility"; + } else { + $tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); + $codelsx = '"vadd.b $vr0, $vr0, $vr0"'; + $lsx_flags = "-march=loongarch64"; + print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n"; + + $args = "$lsx_flags -o $tmplsx.o $tmplsx"; + my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); + system(@cmd) == 0; + if ($? != 0) { + $no_lsx = 1; + } else { + $no_lsx = 0; + } + unlink("$tmplsx.o"); + + $tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); + $codelasx = '"xvadd.b $xr0, $xr0, $xr0"'; + $lasx_flags = "-march=loongarch64"; + print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n"; + + $args = "$lasx_flags -o $tmplasx.o $tmplasx"; + my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); + system(@cmd) == 0; + if ($? != 0) { + $no_lasx = 1; + } else { + $no_lasx = 0; + } + unlink("$tmplasx.o"); + } +} + $architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/); $architecture = e2k if ($data =~ /ARCH_E2K/); @@ -245,6 +290,7 @@ $architecture = arm if ($data =~ /ARCH_ARM/); $architecture = arm64 if ($data =~ /ARCH_ARM64/); $architecture = zarch if ($data =~ /ARCH_ZARCH/); $architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); +$architecture = csky if ($data =~ /ARCH_CSKY/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); @@ -424,6 +470,8 @@ print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1; print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1; print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1; +print MAKEFILE "NO_LSX=1\n" if $no_lsx eq 1; +print MAKEFILE "NO_LASX=1\n" if $no_lasx eq 1; $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; @@ -437,6 +485,8 @@ print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1; +print CONFFILE "#define NO_LSX\t1\n" if $no_lsx eq 1; +print CONFFILE "#define NO_LASX\t1\n" if $no_lasx eq 1; if ($os eq "LINUX") { diff --git a/cblas.h b/cblas.h index c2bdd27fa..beaa32cc2 100644 --- a/cblas.h +++ b/cblas.h @@ -12,6 +12,7 @@ extern "C" { /*Set the number of threads on runtime.*/ void openblas_set_num_threads(int num_threads); void goto_set_num_threads(int num_threads); +int openblas_set_num_threads_local(int num_threads); /*Get the number of threads on runtime.*/ int openblas_get_num_threads(void); @@ -100,6 +101,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + +float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); @@ -115,6 +126,9 @@ void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); +void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); +void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); + void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); @@ -289,6 +303,14 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); +void cblas_sgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, + OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); +void cblas_dgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, + OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc); +void cblas_cgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, + OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); +void cblas_zgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, + OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); @@ -350,7 +372,7 @@ void cblas_cher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBL void cblas_zher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, void *C, OPENBLAS_CONST blasint ldc); -void cblas_xerbla(blasint p, char *rout, char *form, ...); +void cblas_xerbla(blasint p, OPENBLAS_CONST char *rout, OPENBLAS_CONST char *form, ...); /*** BLAS extensions ***/ diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 8521f3988..eb974456b 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -44,9 +44,9 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) - set(DYNAMIC_CORE "${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2") + set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE) endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) @@ -82,7 +82,7 @@ if (DYNAMIC_ARCH) set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN) endif () if (NOT NO_AVX512) - set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE) + set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE SAPPHIRERAPIDS) string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") endif () if (DYNAMIC_LIST) @@ -135,7 +135,7 @@ if (ARM64) set(BINARY_DEFINED 1) endif () -if (${ARCH} STREQUAL "riscv64") +if (RISCV64) set(NO_BINARY_MODE 1) set(BINARY_DEFINED 1) endif () diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 83b8d15ab..2da941afb 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -36,9 +36,19 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS if (LOONGARCH64) if (BINARY64) - set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64") + CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI) + if(COMPILER_SUPPORT_LP64D_ABI) + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64d") + else() + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64") + endif () else () - set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32") + CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI) + if(COMPILER_SUPPORT_ILP32D_ABI) + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=ilp32d") + else() + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32") + endif () endif () set(BINARY_DEFINED 1) endif () @@ -65,6 +75,14 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI") endif () endif () +if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC") + if (POWER) + set(CCOMMON_OPT "${CCOMMON_OPT} -tp pwr8") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -tp px") + endif () +endif () + if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE") if (BINARY64) set(CCOMMON_OPT "${CCOMMON_OPT} -m64") @@ -172,22 +190,30 @@ endif () if (${CORE} STREQUAL NEOVERSEN2) if (NOT DYNAMIC_ARCH) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") else () - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") - endif() + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + endif() + endif () endif () endif () if (${CORE} STREQUAL NEOVERSEV1) if (NOT DYNAMIC_ARCH) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") else () - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + endif() endif() endif () endif () @@ -205,7 +231,11 @@ endif () if (${CORE} STREQUAL ARMV8SVE) if (NOT DYNAMIC_ARCH) - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + endif () endif () endif () @@ -262,6 +292,27 @@ if (${CORE} STREQUAL POWER8) endif () endif () +# With -mcpu=970 added it compiles, but library is broken, at least on macOS. If someone +# tests on *BSD or Linux and adds this flag, please make sure it is not used for macOS case. +if (${CORE} STREQUAL PPC970) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -mtune=970 -maltivec -fno-fast-math") + endif () + if (APPLE) + set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL") + endif () +endif () + +# -mcpu=G4 seems to work fine, but perhaps avoid it for the sake of consistency? +if (${CORE} STREQUAL PPCG4) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -mtune=G4 -maltivec -fno-fast-math") + endif () + if (APPLE) + set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL") + endif () +endif () + if (NOT DYNAMIC_ARCH) if (HAVE_AVX2) set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2") diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index df3a4858d..4c4f5ac04 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -64,6 +64,7 @@ else () "#define NEEDBUNDERSCORE 1\n") endif() +if (CMAKE_Fortran_COMPILER) get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE) string(TOUPPER ${F_COMPILER} F_COMPILER) - +endif() diff --git a/cmake/fc.cmake b/cmake/fc.cmake index e615e148e..bc85a2921 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -3,11 +3,9 @@ ## Description: Ported from portion of OpenBLAS/Makefile.system ## Sets Fortran related variables. -if (${F_COMPILER} STREQUAL "FLANG") +if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") + # This is for classic Flang. LLVM Flang is handled with gfortran below. set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") - if (BINARY64 AND INTERFACE64) - set(FCOMMON_OPT "${FCOMMON_OPT} -i8") - endif () if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") endif () @@ -38,31 +36,56 @@ if (${F_COMPILER} STREQUAL "G95") endif () endif () -if (${F_COMPILER} STREQUAL "GFORTRAN") +if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") - # ensure reentrancy of lapack codes - set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") - # work around ABI violation in passing string arguments from C - set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") - #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc - if (NOT NO_LAPACK) - set(EXTRALIB "${EXTRALIB} -lgfortran") + if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") + # ensure reentrancy of lapack codes + set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") + # work around ABI violation in passing string arguments from C + set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") + if (NOT NO_LAPACK) + # Don't include -lgfortran, when NO_LAPACK=1 or lsbcc + set(EXTRALIB "${EXTRALIB} -lgfortran") + endif () endif () if (NO_BINARY_MODE) if (MIPS64) if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64") + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") + endif () else () set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") endif () endif () if (LOONGARCH64) if (BINARY64) - set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") + CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI) + if(COMPILER_SUPPORT_LP64D_ABI) + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d") + else() + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") + endif () else () - set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") + CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI) + if(COMPILER_SUPPORT_ILP32D_ABI) + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d") + else() + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") + endif () + endif () + endif () + if (RISCV64) + if (BINARY64) + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") + endif () endif () endif () + if (ARM64 AND INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") + endif () else () if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -m64") @@ -121,7 +144,7 @@ if (${F_COMPILER} STREQUAL "IBM") endif () endif () -if (${F_COMPILER} STREQUAL "PGI") +if (${F_COMPILER} STREQUAL "PGI" OR ${F_COMPILER} STREQUAL "PGF95") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PGI") set(COMMON_PROF "${COMMON_PROF} -DPGICOMPILER") if (BINARY64) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 45dda8686..003a8b3c1 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -52,7 +52,7 @@ set(SLASRC sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f sgels.f sgelsd.f sgelss.f sgelsy.f sgeql2.f sgeqlf.f - sgeqp3.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f + sgeqp3.f sgeqp3rk.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f sgesc2.f sgesdd.f sgesvd.f sgesvdx.f sgesvx.f sgetc2.f sgetrf2.f sgetri.f sggbak.f sggbal.f @@ -67,7 +67,7 @@ set(SLASRC slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f slansy.f slantb.f slantp.f slantr.f slanv2.f slapll.f slapmt.f - slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f + slaqgb.f slaqge.f slaqp2.f slaqps.f slaqp2rk.f slaqp3rk.f slaqsb.f slaqsp.f slaqsy.f slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f @@ -124,7 +124,7 @@ set(SLASRC ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f sgesvdq.f slaorhr_col_getrfnp.f slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f - slatrs3.f strsyl3.f sgelst.f) + slatrs3.f strsyl3.f sgelst.f sgedmd.f90 sgedmdq.f90) set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f @@ -139,7 +139,7 @@ set(CLASRC cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f cgehd2.f cgehrd.f cgelq2.f cgelqf.f - cgels.f cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f + cgels.f cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f cgeqp3rk.f cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f cgerq2.f cgerqf.f cgesc2.f cgesdd.f cgesvd.f cgesvdx.f cgesvj.f cgejsv.f cgsvj0.f cgsvj1.f @@ -173,7 +173,7 @@ set(CLASRC clanhb.f clanhe.f clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f - claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f + claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqp2rk.f claqp3rk.f claqsb.f claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f claqz0.f claqz1.f claqz2.f claqz3.f claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f @@ -187,7 +187,7 @@ set(CLASRC cposv.f cposvx.f cpotrf2.f cpotri.f cpstrf.f cpstf2.f cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f cptcon.f cpteqr.f cptrfs.f cptsv.f cptsvx.f cpttrf.f cpttrs.f cptts2.f - crot.f cspcon.f csprfs.f cspsv.f + crot.f crscl.f cspcon.f csprfs.f cspsv.f cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f cstegr.f cstein.f csteqr.f csycon.f csyrfs.f csysv.f csysvx.f csytf2.f csytrf.f csytri.f @@ -223,7 +223,7 @@ set(CLASRC chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f cungtsqr.f cungtsqr_row.f cunhr_col.f - clatrs3.f ctrsyl3.f cgelst.f) + clatrs3.f ctrsyl3.f cgelst.f cgedmd.f90 cgedmdq.f90) set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f @@ -243,7 +243,7 @@ set(DLASRC dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f dgels.f dgelsd.f dgelss.f dgelsy.f dgeql2.f dgeqlf.f - dgeqp3.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f + dgeqp3.f dgeqp3rk.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvdx.f dgesvx.f dgetc2.f dgetrf2.f dgetri.f dggbak.f dggbal.f @@ -258,7 +258,7 @@ set(DLASRC dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f dlapll.f dlapmt.f - dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f + dlaqgb.f dlaqge.f dlaqp2.f dlaqp2rk.f dlaqp3rk.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f @@ -316,7 +316,7 @@ set(DLASRC dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f - dlatrs3.f dtrsyl3.f dgelst.f) + dlatrs3.f dtrsyl3.f dgelst.f dgedmd.f90 dgedmdq.f90) set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f @@ -331,7 +331,7 @@ set(ZLASRC zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f zgehd2.f zgehrd.f zgelq2.f zgelqf.f - zgels.f zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f + zgels.f zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f zgeqp3rk.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f zgesc2.f zgesdd.f zgesvd.f zgesvdx.f zgesvx.f zgesvj.f zgejsv.f zgsvj0.f zgsvj1.f @@ -367,7 +367,7 @@ set(ZLASRC zlanhe.f zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f zlantp.f zlantr.f zlapll.f zlapmt.f zlaqgb.f zlaqge.f - zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f + zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqp2rk.f zlaqp3rk.f zlaqps.f zlaqsb.f zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f @@ -381,7 +381,7 @@ set(ZLASRC zposv.f zposvx.f zpotrf2.f zpotri.f zpotrs.f zpstrf.f zpstf2.f zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f - zrot.f zspcon.f zsprfs.f zspsv.f + zrot.f zrscl.f zspcon.f zsprfs.f zspsv.f zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f zstegr.f zstein.f zsteqr.f zsycon.f zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f @@ -419,7 +419,7 @@ set(ZLASRC zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f zungtsqr.f zungtsqr_row.f zunhr_col.f - zlatrs3.f ztrsyl3.f zgelst.f) + zlatrs3.f ztrsyl3.f zgelst.f zgedmd.f90 zgedmdq.f90) set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f @@ -436,19 +436,25 @@ if(USE_XBLAS) set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) endif() +if(BUILD_LAPACK_DEPRECATED) list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f + DEPRECATED/sgelqs.f DEPRECATED/sgeqrs.f DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f) list(APPEND DLASRC DEPRECATED/dgegs.f DEPRECATED/dgegv.f + DEPRECATED/dgelqs.f DEPRECATED/dgeqrs.f DEPRECATED/dgeqpf.f DEPRECATED/dgelsx.f DEPRECATED/dggsvd.f DEPRECATED/dggsvp.f DEPRECATED/dlahrd.f DEPRECATED/dlatzm.f DEPRECATED/dtzrqf.f) list(APPEND CLASRC DEPRECATED/cgegs.f DEPRECATED/cgegv.f + DEPRECATED/cgelqs.f DEPRECATED/cgeqrs.f DEPRECATED/cgeqpf.f DEPRECATED/cgelsx.f DEPRECATED/cggsvd.f DEPRECATED/cggsvp.f DEPRECATED/clahrd.f DEPRECATED/clatzm.f DEPRECATED/ctzrqf.f) list(APPEND ZLASRC DEPRECATED/zgegs.f DEPRECATED/zgegv.f + DEPRECATED/zgelqs.f DEPRECATED/zgeqrs.f DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f) message(STATUS "Building deprecated routines") +endif() set(DSLASRC spotrs.f) @@ -551,7 +557,7 @@ set(SLASRC sgebrd.c sgecon.c sgeequ.c sgees.c sgeesx.c sgeev.c sgeevx.c sgehd2.c sgehrd.c sgelq2.c sgelqf.c sgels.c sgelsd.c sgelss.c sgelsy.c sgeql2.c sgeqlf.c - sgeqp3.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c + sgeqp3.c sgeqp3rk.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c sgesc2.c sgesdd.c sgesvd.c sgesvdx.c sgesvx.c sgetc2.c sgetrf2.c sgetri.c sggbak.c sggbal.c @@ -565,7 +571,7 @@ set(SLASRC slangb.c slange.c slangt.c slanhs.c slansb.c slansp.c slansy.c slantb.c slantp.c slantr.c slanv2.c slapll.c slapmt.c - slaqgb.c slaqge.c slaqp2.c slaqps.c slaqsb.c slaqsp.c slaqsy.c + slaqgb.c slaqge.c slaqp2.c slaqp2rk.c slaqp3rk.c slaqps.c slaqsb.c slaqsp.c slaqsy.c slaqr0.c slaqr1.c slaqr2.c slaqr3.c slaqr4.c slaqr5.c slaqtr.c slar1v.c slar2v.c ilaslr.c ilaslc.c slarf.c slarfb.c slarfb_gett.c slarfg.c slarfgp.c slarft.c slarfx.c slarfy.c slargv.c @@ -622,7 +628,7 @@ set(SLASRC ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c sgesvdq.c slaorhr_col_getrfnp.c slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c - slatrs3.c strsyl3.c sgelst.c) + slatrs3.c strsyl3.c sgelst.c sgedmd.c sgedmdq.c) set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c @@ -637,7 +643,7 @@ set(CLASRC cgbtf2.c cgbtrf.c cgbtrs.c cgebak.c cgebal.c cgebd2.c cgebrd.c cgecon.c cgeequ.c cgees.c cgeesx.c cgeev.c cgeevx.c cgehd2.c cgehrd.c cgelq2.c cgelqf.c - cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c + cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c cgeqp3rk.c cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c cgesc2.c cgesdd.c cgesvd.c cgesvdx.c cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c @@ -671,7 +677,7 @@ set(CLASRC clanhb.c clanhe.c clanhp.c clanhs.c clanht.c clansb.c clansp.c clansy.c clantb.c clantp.c clantr.c clapll.c clapmt.c clarcm.c claqgb.c claqge.c - claqhb.c claqhe.c claqhp.c claqp2.c claqps.c claqsb.c + claqhb.c claqhe.c claqhp.c claqp2.c claqp2rk.c claqp3rk.c claqps.c claqsb.c claqr0.c claqr1.c claqr2.c claqr3.c claqr4.c claqr5.c claqsp.c claqsy.c clar1v.c clar2v.c ilaclr.c ilaclc.c clarf.c clarfb.c clarfb_gett.c clarfg.c clarfgp.c clarft.c @@ -684,7 +690,7 @@ set(CLASRC cposv.c cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c cppcon.c cppequ.c cpprfs.c cppsv.c cppsvx.c cpptrf.c cpptri.c cpptrs.c cptcon.c cpteqr.c cptrfs.c cptsv.c cptsvx.c cpttrf.c cpttrs.c cptts2.c - crot.c cspcon.c csprfs.c cspsv.c + crot.c crscl.c cspcon.c csprfs.c cspsv.c cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c cstegr.c cstein.c csteqr.c csycon.c csyrfs.c csysv.c csysvx.c csytf2.c csytrf.c csytri.c @@ -720,7 +726,7 @@ set(CLASRC chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c cungtsqr.c cungtsqr_row.c cunhr_col.c - clatrs3.c ctrsyl3.c cgelst.c) + clatrs3.c ctrsyl3.c cgelst.c cgedmd.c cgedmdq.c) set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c @@ -740,7 +746,7 @@ set(DLASRC dgebrd.c dgecon.c dgeequ.c dgees.c dgeesx.c dgeev.c dgeevx.c dgehd2.c dgehrd.c dgelq2.c dgelqf.c dgels.c dgelsd.c dgelss.c dgelsy.c dgeql2.c dgeqlf.c - dgeqp3.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c + dgeqp3.c dgeqp3rk.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c dgesc2.c dgesdd.c dgesvd.c dgesvdx.c dgesvx.c dgetc2.c dgetrf2.c dgetri.c dggbak.c dggbal.c @@ -754,7 +760,7 @@ set(DLASRC dlangb.c dlange.c dlangt.c dlanhs.c dlansb.c dlansp.c dlansy.c dlantb.c dlantp.c dlantr.c dlanv2.c dlapll.c dlapmt.c - dlaqgb.c dlaqge.c dlaqp2.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c + dlaqgb.c dlaqge.c dlaqp2.c dlaqp2rk.c dlaqp3rk.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c dlaqr0.c dlaqr1.c dlaqr2.c dlaqr3.c dlaqr4.c dlaqr5.c dlaqtr.c dlar1v.c dlar2v.c iladlr.c iladlc.c dlarf.c dlarfb.c dlarfb_gett.c dlarfg.c dlarfgp.c dlarft.c dlarfx.c dlarfy.c @@ -812,7 +818,7 @@ set(DLASRC dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c - dlatrs3.c dtrsyl3.c dgelst.c) + dlatrs3.c dtrsyl3.c dgelst.c dgedmd.c dgedmdq.c) set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c @@ -827,7 +833,7 @@ set(ZLASRC zgbtf2.c zgbtrf.c zgbtrs.c zgebak.c zgebal.c zgebd2.c zgebrd.c zgecon.c zgeequ.c zgees.c zgeesx.c zgeev.c zgeevx.c zgehd2.c zgehrd.c zgelq2.c zgelqf.c - zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c + zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c zgeqp3rk.c zgeqr2.c zgeqr2p.c zgeqrf.c zgeqrfp.c zgerfs.c zgerq2.c zgerqf.c zgesc2.c zgesdd.c zgesvd.c zgesvdx.c zgesvx.c zgesvj.c zgejsv.c zgsvj0.c zgsvj1.c @@ -862,7 +868,7 @@ set(ZLASRC zlanhe.c zlanhp.c zlanhs.c zlanht.c zlansb.c zlansp.c zlansy.c zlantb.c zlantp.c zlantr.c zlapll.c zlapmt.c zlaqgb.c zlaqge.c - zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqps.c zlaqsb.c + zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqp2rk.c zlaqp3rk.c zlaqps.c zlaqsb.c zlaqr0.c zlaqr1.c zlaqr2.c zlaqr3.c zlaqr4.c zlaqr5.c zlaqsp.c zlaqsy.c zlar1v.c zlar2v.c ilazlr.c ilazlc.c zlarcm.c zlarf.c zlarfb.c zlarfb_gett.c @@ -876,7 +882,7 @@ set(ZLASRC zposv.c zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c zppcon.c zppequ.c zpprfs.c zppsv.c zppsvx.c zpptrf.c zpptri.c zpptrs.c zptcon.c zpteqr.c zptrfs.c zptsv.c zptsvx.c zpttrf.c zpttrs.c zptts2.c - zrot.c zspcon.c zsprfs.c zspsv.c + zrot.c zrscl.c zspcon.c zsprfs.c zspsv.c zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c zstegr.c zstein.c zsteqr.c zsycon.c zsyrfs.c zsysv.c zsysvx.c zsytf2.c zsytrf.c zsytri.c @@ -913,7 +919,8 @@ set(ZLASRC zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c - zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c) + zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c + zgedmd.c zgedmdq.c) set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c @@ -930,19 +937,25 @@ if(USE_XBLAS) set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) endif() +if(BUILD_LAPACK_DEPRECATED) list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c + DEPRECATED/sgelqs.c DEPRECATED/sgeqrs.c DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c) list(APPEND DLASRC DEPRECATED/dgegs.c DEPRECATED/dgegv.c + DEPRECATED/dgelqs.c DEPRECATED/dgeqrs.c DEPRECATED/dgeqpf.c DEPRECATED/dgelsx.c DEPRECATED/dggsvd.c DEPRECATED/dggsvp.c DEPRECATED/dlahrd.c DEPRECATED/dlatzm.c DEPRECATED/dtzrqf.c) list(APPEND CLASRC DEPRECATED/cgegs.c DEPRECATED/cgegv.c + DEPRECATED/cgelqs.c DEPRECATED/cgeqrs.c DEPRECATED/cgeqpf.c DEPRECATED/cgelsx.c DEPRECATED/cggsvd.c DEPRECATED/cggsvp.c DEPRECATED/clahrd.c DEPRECATED/clatzm.c DEPRECATED/ctzrqf.c) list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c + DEPRECATED/zgelqs.c DEPRECATED/zgeqrs.c DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c) message(STATUS "Building deprecated routines") +endif() set(DSLASRC spotrs.c) diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index 3a9352197..f43bf10d0 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -70,8 +70,6 @@ set(CSRC lapacke_cgeqlf_work.c lapacke_cgeqp3.c lapacke_cgeqp3_work.c - lapacke_cgeqpf.c - lapacke_cgeqpf_work.c lapacke_cgeqr.c lapacke_cgeqr_work.c lapacke_cgeqr2.c @@ -92,6 +90,10 @@ set(CSRC lapacke_cgerqf_work.c lapacke_cgesdd.c lapacke_cgesdd_work.c + lapacke_cgedmd.c + lapacke_cgedmd_work.c + lapacke_cgedmdq.c + lapacke_cgedmdq_work.c lapacke_cgesv.c lapacke_cgesv_work.c lapacke_cgesvd.c @@ -144,12 +146,8 @@ set(CSRC lapacke_cggqrf_work.c lapacke_cggrqf.c lapacke_cggrqf_work.c - lapacke_cggsvd.c - lapacke_cggsvd_work.c lapacke_cggsvd3.c lapacke_cggsvd3_work.c - lapacke_cggsvp.c - lapacke_cggsvp_work.c lapacke_cggsvp3.c lapacke_cggsvp3_work.c lapacke_cgtcon.c @@ -564,6 +562,8 @@ set(CSRC lapacke_ctrsna_work.c lapacke_ctrsyl.c lapacke_ctrsyl_work.c + lapacke_ctrsyl3.c + lapacke_ctrsyl3_work.c lapacke_ctrtri.c lapacke_ctrtri_work.c lapacke_ctrtrs.c @@ -596,6 +596,8 @@ set(CSRC lapacke_cungtr_work.c lapacke_cungtsqr_row.c lapacke_cungtsqr_row_work.c + lapacke_cunhr_col.c + lapacke_cunhr_col_work.c lapacke_cunmbr.c lapacke_cunmbr_work.c lapacke_cunmhr.c @@ -695,8 +697,6 @@ set(DSRC lapacke_dgeqlf_work.c lapacke_dgeqp3.c lapacke_dgeqp3_work.c - lapacke_dgeqpf.c - lapacke_dgeqpf_work.c lapacke_dgeqr.c lapacke_dgeqr_work.c lapacke_dgeqr2.c @@ -717,6 +717,10 @@ set(DSRC lapacke_dgerqf_work.c lapacke_dgesdd.c lapacke_dgesdd_work.c + lapacke_dgedmd.c + lapacke_dgedmd_work.c + lapacke_dgedmdq.c + lapacke_dgedmdq_work.c lapacke_dgesv.c lapacke_dgesv_work.c lapacke_dgesvd.c @@ -771,12 +775,8 @@ set(DSRC lapacke_dggqrf_work.c lapacke_dggrqf.c lapacke_dggrqf_work.c - lapacke_dggsvd.c - lapacke_dggsvd_work.c lapacke_dggsvd3.c lapacke_dggsvd3_work.c - lapacke_dggsvp.c - lapacke_dggsvp_work.c lapacke_dggsvp3.c lapacke_dggsvp3_work.c lapacke_dgtcon.c @@ -874,6 +874,8 @@ set(DSRC lapacke_dorgtr_work.c lapacke_dorgtsqr_row.c lapacke_dorgtsqr_row_work.c + lapacke_dorhr_col.c + lapacke_dorhr_col_work.c lapacke_dormbr.c lapacke_dormbr_work.c lapacke_dormhr.c @@ -1186,6 +1188,8 @@ set(DSRC lapacke_dtrsna_work.c lapacke_dtrsyl.c lapacke_dtrsyl_work.c + lapacke_dtrsyl3.c + lapacke_dtrsyl3_work.c lapacke_dtrtri.c lapacke_dtrtri_work.c lapacke_dtrtrs.c @@ -1275,8 +1279,6 @@ set(SSRC lapacke_sgeqlf_work.c lapacke_sgeqp3.c lapacke_sgeqp3_work.c - lapacke_sgeqpf.c - lapacke_sgeqpf_work.c lapacke_sgeqr.c lapacke_sgeqr_work.c lapacke_sgeqr2.c @@ -1297,6 +1299,10 @@ set(SSRC lapacke_sgerqf_work.c lapacke_sgesdd.c lapacke_sgesdd_work.c + lapacke_sgedmd.c + lapacke_sgedmd_work.c + lapacke_sgedmdq.c + lapacke_sgedmdq_work.c lapacke_sgesv.c lapacke_sgesv_work.c lapacke_sgesvd.c @@ -1351,12 +1357,8 @@ set(SSRC lapacke_sggqrf_work.c lapacke_sggrqf.c lapacke_sggrqf_work.c - lapacke_sggsvd.c - lapacke_sggsvd_work.c lapacke_sggsvd3.c lapacke_sggsvd3_work.c - lapacke_sggsvp.c - lapacke_sggsvp_work.c lapacke_sggsvp3.c lapacke_sggsvp3_work.c lapacke_sgtcon.c @@ -1453,6 +1455,8 @@ set(SSRC lapacke_sorgtr_work.c lapacke_sorgtsqr_row.c lapacke_sorgtsqr_row_work.c + lapacke_sorhr_col.c + lapacke_sorhr_col_work.c lapacke_sormbr.c lapacke_sormbr_work.c lapacke_sormhr.c @@ -1762,6 +1766,8 @@ set(SSRC lapacke_strsna_work.c lapacke_strsyl.c lapacke_strsyl_work.c + lapacke_ctrsyl3.c + lapacke_ctrsyl3_work.c lapacke_strtri.c lapacke_strtri_work.c lapacke_strtrs.c @@ -1849,8 +1855,6 @@ set(ZSRC lapacke_zgeqlf_work.c lapacke_zgeqp3.c lapacke_zgeqp3_work.c - lapacke_zgeqpf.c - lapacke_zgeqpf_work.c lapacke_zgeqr.c lapacke_zgeqr_work.c lapacke_zgeqr2.c @@ -1871,6 +1875,10 @@ set(ZSRC lapacke_zgerqf_work.c lapacke_zgesdd.c lapacke_zgesdd_work.c + lapacke_zgedmd.c + lapacke_zgedmd_work.c + lapacke_zgedmdq.c + lapacke_zgedmdq_work.c lapacke_zgesv.c lapacke_zgesv_work.c lapacke_zgesvd.c @@ -1925,12 +1933,8 @@ set(ZSRC lapacke_zggqrf_work.c lapacke_zggrqf.c lapacke_zggrqf_work.c - lapacke_zggsvd.c - lapacke_zggsvd_work.c lapacke_zggsvd3.c lapacke_zggsvd3_work.c - lapacke_zggsvp.c - lapacke_zggsvp_work.c lapacke_zggsvp3.c lapacke_zggsvp3_work.c lapacke_zgtcon.c @@ -2343,6 +2347,8 @@ set(ZSRC lapacke_ztrsna_work.c lapacke_ztrsyl.c lapacke_ztrsyl_work.c + lapacke_ztrsyl3.c + lapacke_ztrsyl3_work.c lapacke_ztrtri.c lapacke_ztrtri_work.c lapacke_ztrtrs.c @@ -2375,6 +2381,8 @@ set(ZSRC lapacke_zungtr_work.c lapacke_zungtsqr_row.c lapacke_zungtsqr_row_work.c + lapacke_zunhr_col.c + lapacke_zunhr_col_work.c lapacke_zunmbr.c lapacke_zunmbr_work.c lapacke_zunmhr.c @@ -2401,6 +2409,12 @@ set(ZSRC lapacke_csyr_work.c lapacke_ilaver.c ) +if (BUILD_LAPACK_DEPRECATED) +set(SRCS $SRCS lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c) +set(SRCD $SRCD lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c) +set(SRCC $SRCC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c) +set(SRCZ $SRCZ lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c) +endif() set(SRCX lapacke_cgbrfsx.c lapacke_cporfsx.c lapacke_dgerfsx.c lapacke_sgbrfsx.c lapacke_ssyrfsx.c lapacke_zherfsx.c diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index 7e120af86..9526d2df6 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -1,11 +1,13 @@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@ +libnameprefix=@LIBNAMEPREFIX@ +libnamesuffix=@LIBNAMESUFFIX@ libsuffix=@SUFFIX64_UNDERSCORE@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ openblas_config=USE_64BITINT=@INTERFACE64@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version -Version: @OPENBLAS_VERSION@ -URL: https://github.com/xianyi/OpenBLAS -Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix} +Version: @OpenBLAS_VERSION@ +URL: https://github.com/OpenMathLib/OpenBLAS +Libs: @OpenMP_C_FLAGS@ -L${libdir} -l${libnameprefix}openblas${libnamesuffix}${libsuffix} Cflags: -I${includedir} diff --git a/cmake/system.cmake b/cmake/system.cmake index 631e7fe69..95f34e9f8 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -55,7 +55,7 @@ if (DEFINED TARGET) endif () # On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. -if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI") +if (X86_64 AND NOT (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" OR ${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC")) set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native") endif () @@ -280,7 +280,41 @@ if (DEFINED TARGET) if (${TARGET} STREQUAL POWER8) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") endif() + +if (${TARGET} STREQUAL NEOVERSEV1) + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") + else () + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1") + else () + message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.") + endif() + endif() + endif() + if (${TARGET} STREQUAL NEOVERSEN2) + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") + else () + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") + else () + message(FATAL_ERROR "Compiler $${CMAKE_C_COMPILER} {GCC_VERSION} does not support Neoverse N2.") + endif() + endif() + endif() + if (${TARGET} STREQUAL ARMV8SVE) + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve") + else () + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve") + endif() + endif() + endif() + if (DEFINED BINARY) message(STATUS "Compiling a ${BINARY}-bit binary.") endif () @@ -467,10 +501,11 @@ set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_TH endif() endif() endif() + +set(LIBPREFIX "lib${LIBNAMEPREFIX}openblas") + if (DEFINED LIBNAMESUFFIX) - set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") -else () - set(LIBPREFIX "libopenblas") + set(LIBPREFIX "${LIBNAMEPREFIX}_${LIBNAMESUFFIX}") endif () if (NOT DEFINED SYMBOLPREFIX) @@ -645,6 +680,10 @@ else () endif () endif () +if (DEFINED FIXED_LIBNAME) + set (LIBNAME "${LIBPREFIX}.${LIBSUFFIX}") + set (LIBNAME "${LIBPREFIX}_p.${LIBSUFFIX}") +endif() set(LIBDLLNAME "${LIBPREFIX}.dll") set(LIBSONAME "${LIBNAME}.${LIBSUFFIX}.so") diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index c59e85d54..e94497a04 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -38,13 +38,15 @@ if(CMAKE_CL_64 OR MINGW64) endif() elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING)) set(X86 1) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc.*")) set(POWER 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") set(MIPS64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") set(LOONGARCH64 1) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*") + set(RISCV64 1) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*")) if (NOT BINARY) if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") set(X86_64 1) @@ -60,7 +62,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") set(X86 1) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*|armv8.*)") if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") set(ARM64 1) else() @@ -107,7 +109,7 @@ else() endif () if (NOT BINARY) - if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64) + if (X86_64 OR ARM64 OR MIPS64 OR LOONGARCH64 OR RISCV64 OR (POWER AND NOT (CMAKE_OSX_ARCHITECTURES STREQUAL "ppc"))) set(BINARY 64) else () set(BINARY 32) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 56c1cb060..5e8ba866b 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -87,6 +87,15 @@ macro(ParseMakefileVars MAKEFILE_IN) #message(STATUS "skipping ${makefile_line}") continue () endif () + + # Example 1: SBGEMM_SMALL_M_PERMIT = + # Unset the variable + string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + set(var_name ${CMAKE_MATCH_1}) + unset(${var_name}) + endif() + string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") #message(STATUS "match on ${line_match}") diff --git a/common.h b/common.h index 4eeeb8d55..1f0b9e533 100644 --- a/common.h +++ b/common.h @@ -396,7 +396,7 @@ typedef int blasint; #endif /*** -To alloc job_t on heap or statck. +To alloc job_t on heap or stack. please https://github.com/xianyi/OpenBLAS/issues/246 ***/ #if defined(OS_WINDOWS) @@ -482,6 +482,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_e2k.h" #endif +#ifdef ARCH_CSKY +#include "common_csky.h" +#endif + #ifndef ASSEMBLER #ifdef OS_WINDOWSSTORE typedef char env_var_t[MAX_PATH]; @@ -525,7 +529,7 @@ static inline unsigned long long rpcc(void){ #endif // !RPCC_DEFINED #if !defined(BLAS_LOCK_DEFINED) && defined(__GNUC__) -static void __inline blas_lock(volatile BLASULONG *address){ +static __inline void blas_lock(volatile BLASULONG *address){ do { while (*address) {YIELDING;}; diff --git a/common_alpha.h b/common_alpha.h index 021eb93ae..e5380454a 100644 --- a/common_alpha.h +++ b/common_alpha.h @@ -45,7 +45,7 @@ #define WMB asm("wmb") #define RMB asm("mb") -static void __inline blas_lock(unsigned long *address){ +static __inline void blas_lock(unsigned long *address){ #ifndef __DECC unsigned long tmp1, tmp2; asm volatile( diff --git a/common_arm.h b/common_arm.h index 682315de5..a3db9953c 100644 --- a/common_arm.h +++ b/common_arm.h @@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(ARMV6) || defined(ARMV7) || defined(ARMV8) -static void __inline blas_lock(volatile BLASULONG *address){ +static __inline void blas_lock(volatile BLASULONG *address){ int register ret; diff --git a/common_arm64.h b/common_arm64.h index 6a18a294c..6ae6a35a3 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef ASSEMBLER -static void __inline blas_lock(volatile BLASULONG *address){ +static __inline void blas_lock(volatile BLASULONG *address){ BLASULONG ret; @@ -162,7 +162,11 @@ REALNAME: #define HUGE_PAGESIZE ( 4 << 20) #ifndef BUFFERSIZE +#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) +#define BUFFER_SIZE (32 << 22) +#else #define BUFFER_SIZE (32 << 20) +#endif #else #define BUFFER_SIZE (32 << BUFFERSIZE) #endif diff --git a/common_csky.h b/common_csky.h new file mode 100644 index 000000000..3095dc781 --- /dev/null +++ b/common_csky.h @@ -0,0 +1,56 @@ +/***************************************************************************** +Copyright (c) 2011-2015, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#ifndef COMMON_CSKY +#define COMMON_CSKY + +#define MB __sync_synchronize() +#define WMB __sync_synchronize() +#define RMB __sync_synchronize() + +#define INLINE inline + +#ifndef ASSEMBLER + + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#endif + + + +#define BUFFER_SIZE ( 32 << 20) +#define SEEK_ADDRESS + +#endif \ No newline at end of file diff --git a/common_interface.h b/common_interface.h index 318827920..5a2e1654c 100644 --- a/common_interface.h +++ b/common_interface.h @@ -498,6 +498,15 @@ void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double * void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(sgemmt)(char*, char *, char *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dgemmt)(char*, char *, char *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, double *, blasint *); +void BLASFUNC(cgemmt)(char*, char *, char *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zgemmt)(char*, char *, char *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, double *, blasint *); + int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); @@ -764,8 +773,8 @@ xdouble BLASFUNC(qlamc3)(xdouble *, xdouble *); void BLASFUNC(saxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(daxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *); -void BLASFUNC(caxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *); -void BLASFUNC(zaxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *); +void BLASFUNC(caxpby) (blasint *, void *, float *, blasint *, void *, float *, blasint *); +void BLASFUNC(zaxpby) (blasint *, void *, double *, blasint *, void *, double *, blasint *); void BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *); diff --git a/common_loongarch64.h b/common_loongarch64.h index e15539b5f..b1426da79 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -83,6 +83,19 @@ static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } +#ifndef NO_AFFINITY +static inline int WhereAmI(void){ + int ret = 0, counter = 0; + __asm__ volatile ( + "rdtimel.w %[counter], %[id]" + : [id]"=r"(ret), [counter]"=r"(counter) + : + : "memory" + ); + return ret; +} +#endif + #ifdef DOUBLE #define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory") #else @@ -106,12 +119,50 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define MOV fmov.d #define CMOVT fsel #define MTC movgr2fr.d +#define MTG movfr2gr.d #define FABS fabs.d +#define FMIN fmin.d +#define FMINA fmina.d +#define FMAX fmax.d +#define FMAXA fmaxa.d #define CMPEQ fcmp.ceq.d #define CMPLE fcmp.cle.d #define CMPLT fcmp.clt.d #define NEG fneg.d +#define FFINT ffint.d.l + +#define XVFSUB xvfsub.d +#define XVFADD xvfadd.d +#define XVFMUL xvfmul.d +#define XVFMADD xvfmadd.d +#define XVFMIN xvfmin.d +#define XVFMINA xvfmina.d +#define XVFMAX xvfmax.d +#define XVFMAXA xvfmaxa.d +#define XVCMPEQ xvfcmp.ceq.d +#define XVCMPLE xvfcmp.cle.d +#define XVCMPLT xvfcmp.clt.d +#define XVMUL xvfmul.d +#define XVMSUB xvfmsub.d +#define XVNMSUB xvfnmsub.d + +#define VFSUB vfsub.d +#define VFADD vfadd.d +#define VFMUL vfmul.d +#define VFMADD vfmadd.d +#define VFMIN vfmin.d +#define VFMINA vfmina.d +#define VFMAX vfmax.d +#define VFMAXA vfmaxa.d +#define VCMPEQ vfcmp.ceq.d +#define VCMPLE vfcmp.cle.d +#define VCMPLT vfcmp.clt.d +#define VMUL vfmul.d +#define VMSUB vfmsub.d +#define VNMSUB vfnmsub.d + #else + #define LD fld.s #define ST fst.s #define MADD fmadd.s @@ -124,11 +175,48 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define MOV fmov.s #define CMOVT fsel #define MTC movgr2fr.w +#define MTG movfr2gr.s #define FABS fabs.s +#define FMIN fmin.s +#define FMINA fmina.s +#define FMAX fmax.s +#define FMAXA fmaxa.s #define CMPEQ fcmp.ceq.s #define CMPLE fcmp.cle.s #define CMPLT fcmp.clt.s #define NEG fneg.s +#define FFINT ffint.s.l + +#define XVFSUB xvfsub.s +#define XVFADD xvfadd.s +#define XVFMUL xvfmul.s +#define XVFMADD xvfmadd.s +#define XVFMIN xvfmin.s +#define XVFMINA xvfmina.s +#define XVFMAX xvfmax.s +#define XVFMAXA xvfmaxa.s +#define XVCMPEQ xvfcmp.ceq.s +#define XVCMPLE xvfcmp.cle.s +#define XVCMPLT xvfcmp.clt.s +#define XVMUL xvfmul.s +#define XVMSUB xvfmsub.s +#define XVNMSUB xvfnmsub.s + +#define VFSUB vfsub.s +#define VFADD vfadd.s +#define VFMUL vfmul.s +#define VFMADD vfmadd.s +#define VFMIN vfmin.s +#define VFMINA vfmina.s +#define VFMAX vfmax.s +#define VFMAXA vfmaxa.s +#define VCMPEQ vfcmp.ceq.s +#define VCMPLE vfcmp.cle.s +#define VCMPLT vfcmp.clt.s +#define VMUL vfmul.s +#define VMSUB vfmsub.s +#define VNMSUB vfnmsub.s + #endif /* defined(DOUBLE) */ #if defined(__64BIT__) && defined(USE64BITINT) diff --git a/common_param.h b/common_param.h index 1854570f1..c082d248e 100644 --- a/common_param.h +++ b/common_param.h @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -45,12 +46,14 @@ typedef struct { int dtb_entries; + int switch_ratio; int offsetA, offsetB, align; #if BUILD_BFLOAT16 == 1 int sbgemm_p, sbgemm_q, sbgemm_r; int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; int sbgemm_align_k; + int need_amxtile_permission; // 0 default, 1 for device support amx. void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); diff --git a/common_power.h b/common_power.h index a49197fd7..3fe776f23 100644 --- a/common_power.h +++ b/common_power.h @@ -91,7 +91,7 @@ void *qalloc(int flags, size_t bytes); -static void INLINE blas_lock(volatile unsigned long *address){ +static INLINE void blas_lock(volatile unsigned long *address){ long int ret, val = 1; diff --git a/common_riscv64.h b/common_riscv64.h index 7ddbe80a4..eccfc644f 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -91,8 +91,26 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define BUFFER_SIZE ( 32 << 20) #define SEEK_ADDRESS -#if defined(C910V) -#include +#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(RISCV64_ZVL128B) || defined(x280) +# include +#endif + +#if defined( __riscv_xtheadc ) && defined( __riscv_v ) && ( __riscv_v <= 7000 ) +// t-head toolchain uses obsolete rvv intrinsics, can't build for C910V without this +#define RISCV_0p10_INTRINSICS +#define RISCV_RVV(x) x +#else +#define RISCV_RVV(x) __riscv_ ## x +#endif + +#if defined(C910V) || defined(RISCV64_ZVL256B) +# if !defined(DOUBLE) +# define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f32m1_f32)(v) +# else +# define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f64m1_f64)(v) +# endif +#else +# define EXTRACT_FLOAT(v) (v[0]) #endif #endif diff --git a/common_sparc.h b/common_sparc.h index 90a24ebf1..4b9e7840a 100644 --- a/common_sparc.h +++ b/common_sparc.h @@ -45,7 +45,7 @@ #ifndef ASSEMBLER -static void __inline blas_lock(volatile unsigned long *address){ +static __inline void blas_lock(volatile unsigned long *address){ long int ret = 1; diff --git a/common_thread.h b/common_thread.h index 05e1d5489..d37fcb189 100644 --- a/common_thread.h +++ b/common_thread.h @@ -53,7 +53,6 @@ extern void goto_set_num_threads(int nthreads); /* Global Parameter */ extern int blas_cpu_number; extern int blas_num_threads; -extern int blas_num_threads_set; extern int blas_omp_linked; #define BLAS_LEGACY 0x8000U @@ -112,8 +111,9 @@ typedef struct blas_queue { struct blas_queue *next; #if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__) - CRITICAL_SECTION lock; - HANDLE finish; + // CRITICAL_SECTION lock; + // HANDLE finish; + volatile int finished; #else pthread_mutex_t lock; pthread_cond_t finished; @@ -136,27 +136,32 @@ typedef struct blas_queue { #ifdef SMP_SERVER extern int blas_server_avail; +extern int blas_omp_number_max; +extern int blas_omp_threads_local; static __inline int num_cpu_avail(int level) { #ifdef USE_OPENMP int openmp_nthreads; - if (blas_num_threads_set == 0) openmp_nthreads=omp_get_max_threads(); - else - openmp_nthreads=blas_cpu_number; + if (omp_in_parallel()) openmp_nthreads = blas_omp_threads_local; #endif #ifndef USE_OPENMP if (blas_cpu_number == 1 -#endif -#ifdef USE_OPENMP - if (openmp_nthreads == 1 || omp_in_parallel() +#else + if (openmp_nthreads == 1 #endif ) return 1; #ifdef USE_OPENMP - if (blas_cpu_number != openmp_nthreads) { + if (openmp_nthreads > blas_omp_number_max){ +#ifdef DEBUG + fprintf(stderr,"WARNING - more OpenMP threads requested (%d) than available (%d)\n",openmp_nthreads,blas_omp_number_max); +#endif + openmp_nthreads = blas_omp_number_max; + } + if (blas_cpu_number != openmp_nthreads) { goto_set_num_threads(openmp_nthreads); } #endif @@ -189,27 +194,27 @@ int exec_blas(BLASLONG num_cpu, blas_param_t *param, void *buffer); int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, - void *c, BLASLONG ldc, int (*function)(), int threads); + void *c, BLASLONG ldc, int (*function)(void), int threads); -int gemm_thread_m (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); +int gemm_thread_m (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG ), void *, void *, BLASLONG); -int gemm_thread_n (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); +int gemm_thread_n (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT*, FLOAT*, BLASLONG), void *, void *, BLASLONG); -int gemm_thread_mn(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); +int gemm_thread_mn(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG), void *, void *, BLASLONG); -int gemm_thread_variable(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG, BLASLONG); +int gemm_thread_variable(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG), void *, void *, BLASLONG, BLASLONG); int trsm_thread(int mode, BLASLONG m, BLASLONG n, double alpha_r, double alpha_i, void *a, BLASLONG lda, - void *c, BLASLONG ldc, int (*function)(), void *buffer); + void *c, BLASLONG ldc, int (*function)(void), void *buffer); -int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); +int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*, FLOAT *, FLOAT *, BLASLONG), void*, void*, BLASLONG); int getrf_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *offsetA, BLASLONG lda, void *offsetB, BLASLONG jb, - void *ipiv, BLASLONG offset, int (*function)(), void *buffer); + void *ipiv, BLASLONG offset, int (*function)(void), void *buffer); #endif /* ENDIF ASSEMBLER */ diff --git a/common_x86.h b/common_x86.h index bc77eca58..65fb9a460 100644 --- a/common_x86.h +++ b/common_x86.h @@ -54,7 +54,7 @@ #define __volatile__ #endif -static void __inline blas_lock(volatile BLASULONG *address){ +static __inline void blas_lock(volatile BLASULONG *address){ int ret; diff --git a/common_x86_64.h b/common_x86_64.h index 729a055ce..dda168d6c 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -70,7 +70,7 @@ #define RMB #endif -static void __inline blas_lock(volatile BLASULONG *address){ +static __inline void blas_lock(volatile BLASULONG *address){ #ifndef C_MSVC diff --git a/common_zarch.h b/common_zarch.h index 442bae821..80609251b 100644 --- a/common_zarch.h +++ b/common_zarch.h @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef ASSEMBLER /* -static void __inline blas_lock(volatile BLASULONG *address){ +static __inline void blas_lock(volatile BLASULONG *address){ BLASULONG ret; diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 1080ea974..8c5d04c14 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -267,8 +267,10 @@ int detect(void) } #else #ifdef __APPLE__ - sysctlbyname("hw.cpufamily",&value,&length,NULL,0); - if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; + sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0); + if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1 + if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 + if (value64 == 2271604202) return CPU_VORTEX; //A16/M3 #endif return CPU_ARMV8; #endif diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c index ca07c7ffb..0ad32ae4e 100644 --- a/cpuid_loongarch64.c +++ b/cpuid_loongarch64.c @@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include +#include /* If LASX extension instructions supported, * using core LOONGSON3R5 @@ -46,9 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_LOONGSON3R5 1 #define CPU_LOONGSON2K1000 2 -#define LOONGARCH_CFG2 0x02 -#define LOONGARCH_LASX 1<<7 -#define LOONGARCH_LSX 1<<6 +#define LA_HWCAP_LSX (1U << 4) +#define LA_HWCAP_LASX (1U << 5) static char *cpuname[] = { "LOONGSONGENERIC", @@ -64,17 +64,11 @@ static char *cpuname_lower[] = { int detect(void) { #ifdef __linux - uint32_t reg = 0; + int hwcap = (int)getauxval(AT_HWCAP); - __asm__ volatile ( - "cpucfg %0, %1 \n\t" - : "+&r"(reg) - : "r"(LOONGARCH_CFG2) - ); - - if (reg & LOONGARCH_LASX) + if (hwcap & LA_HWCAP_LASX) return CPU_LOONGSON3R5; - else if (reg & LOONGARCH_LSX) + else if (hwcap & LA_HWCAP_LSX) return CPU_LOONGSON2K1000; else return CPU_GENERIC; @@ -100,7 +94,9 @@ void get_subdirname(void) { } void get_cpuconfig(void) { + uint32_t hwcaps = 0; int d = detect(); + switch (d) { case CPU_LOONGSON3R5: printf("#define LOONGSON3R5\n"); @@ -135,6 +131,10 @@ void get_cpuconfig(void) { printf("#define L2_ASSOCIATIVE 16\n"); break; } + + hwcaps = (uint32_t)getauxval( AT_HWCAP ); + if (hwcaps & LA_HWCAP_LSX) printf("#define HAVE_LSX\n"); + if (hwcaps & LA_HWCAP_LASX) printf("#define HAVE_LASX\n"); } void get_libname(void){ diff --git a/cpuid_power.c b/cpuid_power.c index 2526e8d0e..1ced8930a 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -160,6 +160,7 @@ int detect(void){ infoCount = HOST_BASIC_INFO_COUNT; host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, &infoCount); + if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7400) return CPUTYPE_PPCG4; if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7450) return CPUTYPE_PPCG4; if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_970) return CPUTYPE_PPC970; diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c index 894d2b873..ff7ba2aad 100644 --- a/cpuid_riscv64.c +++ b/cpuid_riscv64.c @@ -70,12 +70,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#define CPU_GENERIC 0 -#define CPU_C910V 1 +#define CPU_GENERIC 0 +#define CPU_C910V 1 +#define CPU_x280 2 +#define CPU_RISCV64_ZVL256B 3 +#define CPU_RISCV64_ZVL128B 4 static char *cpuname[] = { "RISCV64_GENERIC", - "C910V" + "C910V", + "x280", + "CPU_RISCV64_ZVL256B", + "CPU_RISCV64_ZVL128B" +}; + +static char *cpuname_lower[] = { + "riscv64_generic", + "c910v", + "x280", + "riscv64_zvl256b", + "riscv64_zvl128b" }; int detect(void){ @@ -86,23 +100,29 @@ int detect(void){ char *pmodel = NULL, *pisa = NULL; infile = fopen("/proc/cpuinfo", "r"); + if (!infile) + return CPU_GENERIC; while (fgets(buffer, sizeof(buffer), infile)){ if(!strncmp(buffer, "model name", 10)){ strcpy(model_buffer, buffer); - pmodel = strchr(isa_buffer, ':') + 1; + pmodel = strchr(model_buffer, ':'); + if (pmodel) + pmodel++; } if(!strncmp(buffer, "isa", 3)){ strcpy(isa_buffer, buffer); - pisa = strchr(isa_buffer, '4') + 1; + pisa = strchr(isa_buffer, '4'); + if (pisa) + pisa++; } } fclose(infile); - if (!pmodel) + if (!pmodel || !pisa) return(CPU_GENERIC); - + if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v')) return CPU_C910V; @@ -140,5 +160,5 @@ void get_cpuconfig(void){ } void get_libname(void){ - printf("riscv64\n"); + printf("%s", cpuname_lower[detect()]); } diff --git a/cpuid_x86.c b/cpuid_x86.c index ad13a8c8c..6cf4d6503 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -194,7 +194,7 @@ static C_INLINE void xgetbv(int op, int * eax, int * edx){ } #endif -int support_avx(){ +int support_avx(void){ #ifndef NO_AVX int eax, ebx, ecx, edx; int ret=0; @@ -212,7 +212,7 @@ int support_avx(){ #endif } -int support_avx2(){ +int support_avx2(void){ #ifndef NO_AVX2 int eax, ebx, ecx=0, edx; int ret=0; @@ -228,7 +228,7 @@ int support_avx2(){ #endif } -int support_avx512(){ +int support_avx512(void){ #if !defined(NO_AVX) && !defined(NO_AVX512) int eax, ebx, ecx, edx; int ret=0; @@ -250,7 +250,7 @@ int support_avx512(){ #endif } -int support_avx512_bf16(){ +int support_avx512_bf16(void){ #if !defined(NO_AVX) && !defined(NO_AVX512) int eax, ebx, ecx, edx; int ret=0; @@ -271,7 +271,7 @@ int support_avx512_bf16(){ #define BIT_AMX_BF16 0x00400000 #define BIT_AMX_ENBD 0x00060000 -int support_amx_bf16() { +int support_amx_bf16(void) { #if !defined(NO_AVX) && !defined(NO_AVX512) int eax, ebx, ecx, edx; int ret=0; @@ -1479,6 +1479,8 @@ int get_cpuname(void){ else return CPUTYPE_NEHALEM; case 15: // Sapphire Rapids + if(support_amx_bf16()) + return CPUTYPE_SAPPHIRERAPIDS; if(support_avx512_bf16()) return CPUTYPE_COOPERLAKE; if(support_avx512()) @@ -1549,6 +1551,7 @@ int get_cpuname(void){ case 7: // Raptor Lake case 10: case 15: + case 14: // Alder Lake N if(support_avx2()) return CPUTYPE_HASWELL; if(support_avx()) @@ -1657,7 +1660,13 @@ int get_cpuname(void){ else return CPUTYPE_BARCELONA; } - case 10: // Zen3 + case 10: // Zen3/4 +#ifndef NO_AVX512 + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; + if(support_avx512()) + return CPUTYPE_SKYLAKEX; +#endif if(support_avx()) #ifndef NO_AVX2 return CPUTYPE_ZEN; @@ -1845,7 +1854,8 @@ static char *cpuname[] = { "ZEN", "SKYLAKEX", "DHYANA", - "COOPERLAKE" + "COOPERLAKE", + "SAPPHIRERAPIDS", }; static char *lowercpuname[] = { @@ -1902,7 +1912,8 @@ static char *lowercpuname[] = { "zen", "skylakex", "dhyana", - "cooperlake" + "cooperlake", + "sapphirerapids", }; static char *corename[] = { @@ -1936,7 +1947,8 @@ static char *corename[] = { "ZEN", "SKYLAKEX", "DHYANA", - "COOPERLAKE" + "COOPERLAKE", + "SAPPHIRERAPIDS", }; static char *corename_lower[] = { @@ -1970,7 +1982,8 @@ static char *corename_lower[] = { "zen", "skylakex", "dhyana", - "cooperlake" + "cooperlake", + "sapphirerapids", }; @@ -2276,16 +2289,18 @@ int get_coretype(void){ return CORE_NEHALEM; } if (model == 15) { // Sapphire Rapids + if(support_amx_bf16()) + return CORE_SAPPHIRERAPIDS; if(support_avx512_bf16()) - return CPUTYPE_COOPERLAKE; + return CORE_COOPERLAKE; if(support_avx512()) - return CPUTYPE_SKYLAKEX; + return CORE_SKYLAKEX; if(support_avx2()) - return CPUTYPE_HASWELL; + return CORE_HASWELL; if(support_avx()) - return CPUTYPE_SANDYBRIDGE; + return CORE_SANDYBRIDGE; else - return CPUTYPE_NEHALEM; + return CORE_NEHALEM; } break; @@ -2352,6 +2367,7 @@ int get_coretype(void){ case 7: // Raptor Lake case 10: case 15: + case 14: // Alder Lake N #ifndef NO_AVX2 if(support_avx2()) return CORE_HASWELL; @@ -2428,6 +2444,12 @@ int get_coretype(void){ // Ryzen 2 default: // Matisse,Renoir Ryzen2 models +#ifndef NO_AVX512 + if(support_avx512_bf16()) + return CORE_COOPERLAKE; + if(support_avx512()) + return CORE_SKYLAKEX; +#endif if(support_avx()) #ifndef NO_AVX2 return CORE_ZEN; diff --git a/ctest.c b/ctest.c index 2ccae8dcc..cbc15326f 100644 --- a/ctest.c +++ b/ctest.c @@ -173,6 +173,10 @@ HAVE_C11 ARCH_E2K #endif +#if defined(__csky__) +ARCH_CSKY +#endif + #if defined(__EMSCRIPTEN__) ARCH_RISCV64 OS_WINDOWS diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index 91338b73b..c56a78346 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -40,6 +40,10 @@ else() c_${float_char}blas1.c) endif() target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) + if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) + string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") + target_link_libraries(x${float_char}cblat1 omp pthread) + endif() if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat1 m) endif() @@ -65,6 +69,10 @@ else() constant.c) endif() target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) + if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) + string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") + target_link_libraries(x${float_char}cblat2 omp pthread) + endif() if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat2 m) endif() @@ -80,6 +88,17 @@ if (NOT NOFORTRAN) auxiliary.c c_xerbla.c constant.c) + if (USE_GEMM3M) + if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) + add_executable(x${float_char}cblat3_3m + c_${float_char}blat3_3m.f + c_${float_char}blas3_3m.c + c_${float_char}3chke_3m.c + auxiliary.c + c_xerbla.c + constant.c) + endif() + endif() else() add_executable(x${float_char}cblat3 c_${float_char}blat3c.c @@ -88,12 +107,44 @@ else() auxiliary.c c_xerbla.c constant.c) + if (USE_GEMM3M) + if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) + add_executable(x${float_char}cblat3_3m + c_${float_char}blat3c_3m.c + c_${float_char}blas3_3m.c + c_${float_char}3chke_3m.c + auxiliary.c + c_xerbla.c + constant.c) + endif() + endif() endif() target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) + if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) + string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") + target_link_libraries(x${float_char}cblat3 omp pthread) + endif() if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat3 m) endif() + if (USE_GEMM3M) + if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) + target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME}) + if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) + string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") + target_link_libraries(x${float_char}cblat3 omp pthread) + endif() + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") + target_link_libraries(x${float_char}cblat3_3m m) + endif() + endif() + endif() add_test(NAME "x${float_char}cblat3" COMMAND ${test_helper} $ "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") - + if (USE_GEMM3M) + if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) + add_test(NAME "x${float_char}cblat3_3m" + COMMAND ${test_helper} $ "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3_3m") + endif() + endif() endforeach() diff --git a/ctest/Makefile b/ctest/Makefile index 0fb2450d2..bbaf96f8e 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -5,6 +5,24 @@ TOPDIR = .. include $(TOPDIR)/Makefile.system +SUPPORT_GEMM3M = 0 + +ifeq ($(ARCH), x86) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), x86_64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), ia64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), MIPS) +SUPPORT_GEMM3M = 1 +endif + override CFLAGS += -DADD$(BU) -DCBLAS ifeq ($(F_COMPILER),GFORTRAN) override FFLAGS += -fno-tree-vectorize @@ -144,9 +162,15 @@ all3targets += xdcblat3 endif ifeq ($(BUILD_COMPLEX),1) all3targets += xccblat3 +ifeq ($(SUPPORT_GEMM3M),1) +all3targets += xccblat3_3m +endif endif ifeq ($(BUILD_COMPLEX16),1) all3targets += xzcblat3 +ifeq ($(SUPPORT_GEMM3M),1) +all3targets += xzcblat3_3m +endif endif all3: $(all3targets) @@ -181,9 +205,9 @@ endif endif endif -all3_3m: xzcblat3_3m xccblat3_3m +ifeq ($(SUPPORT_GEMM3M),1) ifeq ($(USE_OPENMP), 1) -ifeq ($(BUILD_SINGLE),1) +ifeq ($(BUILD_COMPLEX),1) OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m endif ifeq ($(BUILD_COMPLEX16),1) @@ -197,6 +221,7 @@ ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m endif endif +endif @@ -208,12 +233,20 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) ifeq ($(USE_OPENMP), 1) ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(C_COMPILER), CLANG) -CEXTRALIB = -lomp +CEXTRALIB += -lomp endif endif ifeq ($(F_COMPILER), NAG) CEXTRALIB = -lgomp endif +ifeq ($(F_COMPILER), IBM) +ifeq ($(C_COMPILER), GCC) +CEXTRALIB += -lgomp +endif +ifeq ($(C_COMPILER), CLANG) +CEXTRALIB += -lomp +endif +endif endif ifeq ($(BUILD_SINGLE),1) @@ -263,8 +296,10 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +ifeq ($(SUPPORT_GEMM3M),1) xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif else xccblat1: $(ctestl1o) c_cblat1c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xccblat1 c_cblat1c.o $(ctestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) @@ -272,6 +307,10 @@ xccblat2: $(ctestl2o) c_cblat2c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xccblat2 c_cblat2c.o $(ctestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) xccblat3: $(ctestl3o) c_cblat3c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xccblat3 c_cblat3c.o $(ctestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) +ifeq ($(SUPPORT_GEMM3M),1) +xccblat3_3m: $(ctestl3o_3m) c_cblat3c_3m.o $(TOPDIR)/$(LIBNAME) + $(CC) $(CFLAGS) -o xccblat3_3m c_cblat3c_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif endif endif @@ -285,8 +324,10 @@ xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +ifeq ($(SUPPORT_GEMM3M),1) xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif else xzcblat1: $(ztestl1o) c_zblat1c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xzcblat1 c_zblat1c.o $(ztestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) @@ -294,6 +335,10 @@ xzcblat2: $(ztestl2o) c_zblat2c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xzcblat2 c_zblat2c.o $(ztestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) xzcblat3: $(ztestl3o) c_zblat3c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xzcblat3 c_zblat3c.o $(ztestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) +ifeq ($(SUPPORT_GEMM3M),1) +xzcblat3_3m: $(ztestl3o_3m) c_zblat3c_3m.o $(TOPDIR)/$(LIBNAME) + $(CC) $(CFLAGS) -o xzcblat3_3m c_zblat3c_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif endif endif diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f index 1a123d74d..cad7c7fa7 100644 --- a/ctest/c_cblat1.f +++ b/ctest/c_cblat1.f @@ -96,7 +96,7 @@ INTEGER ICAMAXTEST EXTERNAL SCASUMTEST, SCNRM2TEST, ICAMAXTEST * .. External Subroutines .. - EXTERNAL CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1 + EXTERNAL CSCALTEST, CSSCALTEST, CTEST, ITEST1, STEST1 * .. Intrinsic Functions .. INTRINSIC MAX * .. Common blocks .. @@ -214,8 +214,8 @@ CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1), + STRUE4(NP1),SFAC) ELSE IF (ICASE.EQ.8) THEN -* .. CSCAL .. - CALL CSCAL(N,CA,CX,INCX) +* .. CSCALTEST .. + CALL CSCALTEST(N,CA,CX,INCX) CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), + SFAC) ELSE IF (ICASE.EQ.9) THEN @@ -236,14 +236,14 @@ * INCX = 1 IF (ICASE.EQ.8) THEN -* CSCAL +* CSCALTEST * Add a test for alpha equal to zero. CA = (0.0E0,0.0E0) DO 80 I = 1, 5 MWPCT(I) = (0.0E0,0.0E0) MWPCS(I) = (1.0E0,1.0E0) 80 CONTINUE - CALL CSCAL(5,CA,CX,INCX) + CALL CSCALTEST(5,CA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) ELSE IF (ICASE.EQ.9) THEN * CSSCALTEST diff --git a/ctest/c_cblat1c.c b/ctest/c_cblat1c.c index 8c0dd140c..d9a539097 100644 --- a/ctest/c_cblat1c.c +++ b/ctest/c_cblat1c.c @@ -242,251 +242,6 @@ typedef struct Namelist Namelist; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -#if 0 -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif -#if 0 -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif -/* -- translated by f2c (version 20000121). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - /* Common Block Declarations */ @@ -396,7 +273,7 @@ static integer c_n1 = -1; static integer c__0 = 0; static logical c_false = FALSE_; -/* Main program */ int main() +/* Main program */ int main(void) { /* Initialized data */ @@ -414,17 +291,21 @@ static logical c_false = FALSE_; static logical same; static integer ninc, nbet, ntra; static logical rewi; - extern /* Subroutine */ int cchk1_(), cchk2_(), cchk3_(), cchk4_(), - cchk5_(), cchk6_(); + extern /* Subroutine */ int cchk1_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, integer*, ftnlen); + extern /* Subroutine */ int cchk2_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, integer*, ftnlen); + extern /* Subroutine */ int cchk3_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, complex*, integer*, ftnlen); + extern /* Subroutine */ int cchk4_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, complex*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, complex*, integer*, ftnlen); + extern /* Subroutine */ int cchk5_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, complex*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, complex*, integer*, ftnlen); + extern /* Subroutine */ int cchk6_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, complex*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, complex*, integer*, ftnlen); static complex a[4225] /* was [65][65] */; static real g[65]; static integer i__, j, n; static logical fatal; static complex x[65], y[65], z__[130]; - extern doublereal sdiff_(); + extern doublereal sdiff_(real*, real*); static logical trace; static integer nidim; - extern /* Subroutine */ int cmvch_(); + extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen); static char snaps[32], trans[1]; static integer isnum; static logical ltest[17]; @@ -438,11 +319,11 @@ static logical c_false = FALSE_; static char snamet[12]; static real thresh; static logical rorder; - extern /* Subroutine */ int cc2chke_(); + extern /* Subroutine */ void cc2chke_(char*, ftnlen); static integer layout; static logical ltestt, tsterr; static complex alf[7]; - extern logical lce_(); + extern logical lce_(complex*, complex*, integer*); static integer inc[7], nkb; static complex bet[7]; static real eps, err; @@ -983,22 +864,7 @@ L240: } /* MAIN__ */ -/* Subroutine */ int cchk1_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, - incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nkb, *kb, *nalf; -complex *alf; -integer *nbet; -complex *bet; -integer *ninc, *inc, *nmax, *incmax; -complex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt; -real *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int cchk1_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, complex* alf, integer* nbet, complex* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* y, complex* yy, complex* ys, complex* yt, real* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1015,10 +881,10 @@ ftnlen sname_len; static integer incx, incy; static logical full, tran, null; static integer i__, m, n; - extern /* Subroutine */ int cmake_(); + extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen); static complex alpha; static logical isame[13]; - extern /* Subroutine */ int cmvch_(); + extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static integer incxs, incys; @@ -1026,14 +892,15 @@ ftnlen sname_len; static integer ia, ib, ic; static logical banded; static integer nc, nd, im, in, kl, ml, nk, nl, ku, ix, iy, ms, lx, ly, ns; - extern /* Subroutine */ int ccgbmv_(), ccgemv_(); - extern logical lceres_(); + extern /* Subroutine */ int ccgbmv_(integer*, char*, integer*, integer*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, ftnlen); + extern /* Subroutine */ void ccgemv_(integer*, char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, ftnlen); + extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen); static char ctrans[14]; static real errmax; static complex transl; static char transs[1]; static integer laa, lda; - extern logical lce_(); + extern logical lce_(complex*, complex*, integer*); static complex als, bls; static real err; static integer iku, kls, kus; @@ -1448,22 +1315,7 @@ L140: } /* cchk1_ */ -/* Subroutine */ int cchk2_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, - incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nkb, *kb, *nalf; -complex *alf; -integer *nbet; -complex *bet; -integer *ninc, *inc, *nmax, *incmax; -complex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt; -real *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int cchk2_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, complex* alf, integer* nbet, complex* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* y, complex* yy, complex* ys, complex* yt, real* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1481,10 +1333,10 @@ ftnlen sname_len; static logical full, null; static char uplo[1]; static integer i__, k, n; - extern /* Subroutine */ int cmake_(); + extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen); static complex alpha; static logical isame[13]; - extern /* Subroutine */ int cmvch_(); + extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static char cuplo[14]; @@ -1495,13 +1347,14 @@ ftnlen sname_len; static integer nc, ik, in; static logical packed; static integer nk, ks, ix, iy, ns, lx, ly; - extern /* Subroutine */ int cchbmv_(), cchemv_(); - extern logical lceres_(); - extern /* Subroutine */ int cchpmv_(); + extern /* Subroutine */ void cchbmv_(integer*, char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, ftnlen); + extern /* Subroutine */ void cchemv_(integer*, char*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, ftnlen); + extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cchpmv_(integer*, char*, integer*, complex*, complex*, complex*, integer*, complex*, complex*, integer*, ftnlen); static real errmax; static complex transl; static integer laa, lda; - extern logical lce_(); + extern logical lce_(complex*, complex*, integer*); static complex als, bls; static real err; @@ -1906,19 +1759,7 @@ L130: } /* cchk2_ */ -/* Subroutine */ int cchk3_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nkb, kb, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, xt, g, z__, iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nkb, *kb, *ninc, *inc, *nmax, *incmax; -complex *a, *aa, *as, *x, *xx, *xs, *xt; -real *g; -complex *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int cchk3_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* xt, real* g, complex* z__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1937,10 +1778,10 @@ ftnlen sname_len; static logical full, null; static char uplo[1], cdiag[14]; static integer i__, k, n; - extern /* Subroutine */ int cmake_(); + extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen); static char diags[1]; static logical isame[13]; - extern /* Subroutine */ int cmvch_(); + extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static char cuplo[14]; @@ -1950,17 +1791,19 @@ ftnlen sname_len; static integer nc, ik, in; static logical packed; static integer nk, ks, ix, ns, lx; - extern logical lceres_(); - extern /* Subroutine */ int cctbmv_(), cctbsv_(); + extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cctbmv_(integer*, char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cctbsv_(integer*, char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, ftnlen, ftnlen, ftnlen); static char ctrans[14]; - extern /* Subroutine */ int cctpmv_(); + extern /* Subroutine */ void cctpmv_(integer*, char*, char*, char*, integer*, complex*, complex*, integer*, ftnlen, ftnlen, ftnlen); static real errmax; - extern /* Subroutine */ int cctrmv_(), cctpsv_(); + extern /* Subroutine */ void cctrmv_(integer*, char*, char*, char*, integer*, complex*, integer*, complex*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cctpsv_(integer*, char*, char*, char*, integer*, complex*, complex*, integer*, ftnlen, ftnlen, ftnlen); static complex transl; - extern /* Subroutine */ int cctrsv_(); + extern /* Subroutine */ void cctrsv_(integer*, char*, char*, char*, integer*, complex*, integer*, complex*, integer*, ftnlen, ftnlen, ftnlen); static char transs[1]; static integer laa, icd, lda; - extern logical lce_(); + extern logical lce_(complex*, complex*, integer*); static integer ict, icu; static real err; @@ -2418,21 +2261,7 @@ L130: } /* cchk3_ */ -/* Subroutine */ int cchk4_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -complex *alf; -integer *ninc, *inc, *nmax, *incmax; -complex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt; -real *g; -complex *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int cchk4_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, complex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* y, complex* yy, complex* ys, complex* yt, real* g, complex* z__, integer* iorder, ftnlen sname_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; @@ -2444,21 +2273,21 @@ ftnlen sname_len; static integer incx, incy; static logical null; static integer i__, j, m, n; - extern /* Subroutine */ int cmake_(); + extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen); static complex alpha, w[1]; static logical isame[13]; - extern /* Subroutine */ int cmvch_(); + extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static integer incxs, incys, ia, nc, nd, im, in; - extern /* Subroutine */ int ccgerc_(); + extern /* Subroutine */ void ccgerc_(integer*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, integer*); static integer ms, ix, iy, ns, lx, ly; - extern /* Subroutine */ int ccgeru_(); - extern logical lceres_(); + extern /* Subroutine */ void ccgeru_(integer*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, integer*); + extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen); static real errmax; static complex transl; static integer laa, lda; - extern logical lce_(); + extern logical lce_(complex*, complex*, integer*); static complex als; static real err; @@ -2786,21 +2615,7 @@ L150: } /* cchk4_ */ -/* Subroutine */ int cchk5_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -complex *alf; -integer *ninc, *inc, *nmax, *incmax; -complex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt; -real *g; -complex *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int cchk5_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, complex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* y, complex* yy, complex* ys, complex* yt, real* g, complex* z__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -2818,10 +2633,12 @@ ftnlen sname_len; static logical full, null; static char uplo[1]; static integer i__, j, n; - extern /* Subroutine */ int cmake_(), ccher_(); + extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void ccher_(integer*, char*, integer*, real*, complex*, integer*, complex*, integer*, ftnlen); static complex alpha, w[1]; static logical isame[13]; - extern /* Subroutine */ int cchpr_(), cmvch_(); + extern /* Subroutine */ void cchpr_(integer*, char*, integer*, real*, complex*, integer*, complex*, ftnlen); + extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static char cuplo[14]; @@ -2832,11 +2649,11 @@ ftnlen sname_len; static logical packed; static integer ix, ns, lx; static real ralpha; - extern logical lceres_(); + extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen); static real errmax; static complex transl; static integer laa, lda; - extern logical lce_(); + extern logical lce_(complex*, complex*, integer*); static real err; /* Tests CHER and CHPR. */ @@ -3160,21 +2977,7 @@ L130: } /* cchk5_ */ -/* Subroutine */ int cchk6_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -complex *alf; -integer *ninc, *inc, *nmax, *incmax; -complex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt; -real *g; -complex *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int cchk6_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, complex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* y, complex* yy, complex* ys, complex* yt, real* g, complex* z__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -3192,25 +2995,26 @@ ftnlen sname_len; static logical full, null; static char uplo[1]; static integer i__, j, n; - extern /* Subroutine */ int cmake_(); + extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen); static complex alpha, w[2]; static logical isame[13]; - extern /* Subroutine */ int cmvch_(); + extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static char cuplo[14]; static integer incxs, incys; static logical upper; static char uplos[1]; - extern /* Subroutine */ int ccher2_(), cchpr2_(); + extern /* Subroutine */ void ccher2_(integer*, char*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, integer*, ftnlen); + extern /* Subroutine */ void cchpr2_(integer*, char*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, ftnlen); static integer ia, ja, ic, nc, jj, lj, in; static logical packed; static integer ix, iy, ns, lx, ly; - extern logical lceres_(); + extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen); static real errmax; static complex transl; static integer laa, lda; - extern logical lce_(); + extern logical lce_(complex*, complex*, integer*); static complex als; static real err; @@ -3597,24 +3401,7 @@ L170: } /* cchk6_ */ -/* Subroutine */ int cmvch_(trans, m, n, alpha, a, nmax, x, incx, beta, y, - incy, yt, g, yy, eps, err, fatal, nout, mv, trans_len) -char *trans; -integer *m, *n; -complex *alpha, *a; -integer *nmax; -complex *x; -integer *incx; -complex *beta, *y; -integer *incy; -complex *yt; -real *g; -complex *yy; -real *eps, *err; -logical *fatal; -integer *nout; -logical *mv; -ftnlen trans_len; +/* Subroutine */ int cmvch_(char* trans, integer* m, integer* n, complex* alpha, complex* a, integer* nmax, complex* x, integer* incx, complex* beta, complex* y, integer* incy, complex* yt, real* g, complex* yy, real* eps, real* err, logical* fatal, integer* nout, logical* mv, ftnlen trans_len) { /* System generated locals */ @@ -3812,9 +3599,7 @@ L80: } /* cmvch_ */ -logical lce_(ri, rj, lr) -complex *ri, *rj; -integer *lr; +logical lce_(complex* ri, complex* rj, integer* lr) { /* System generated locals */ integer i__1, i__2, i__3; @@ -3861,13 +3646,7 @@ L30: } /* lce_ */ -logical lceres_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len) -char *type__, *uplo; -integer *m, *n; -complex *aa, *as; -integer *lda; -ftnlen type_len; -ftnlen uplo_len; +logical lceres_(char* type__, char* uplo, integer* m, integer* n, complex* aa, complex* as, integer* lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4; @@ -3960,9 +3739,7 @@ L80: } /* lceres_ */ -/* Complex */ VOID cbeg_( ret_val, reset) -complex * ret_val; -logical *reset; +/* Complex */ VOID cbeg_(complex* ret_val, logical* reset) { /* System generated locals */ real r__1, r__2; @@ -4023,8 +3800,7 @@ L10: } /* cbeg_ */ -doublereal sdiff_(x, y) -real *x, *y; +doublereal sdiff_(real* x, real* y) { /* System generated locals */ real ret_val; @@ -4044,19 +3820,7 @@ real *x, *y; } /* sdiff_ */ -/* Subroutine */ int cmake_(type__, uplo, diag, m, n, a, nmax, aa, lda, kl, - ku, reset, transl, type_len, uplo_len, diag_len) -char *type__, *uplo, *diag; -integer *m, *n; -complex *a; -integer *nmax; -complex *aa; -integer *lda, *kl, *ku; -logical *reset; -complex *transl; -ftnlen type_len; -ftnlen uplo_len; -ftnlen diag_len; +/* Subroutine */ int cmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, complex* a, integer* nmax, complex* aa, integer* lda, integer* kl, integer* ku, logical* reset, complex* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; @@ -4064,7 +3828,7 @@ ftnlen diag_len; complex q__1, q__2; /* Local variables */ - extern /* Complex */ VOID cbeg_(); + extern /* Complex */ VOID cbeg_(complex*, logical*); static integer ibeg, iend, ioff; static logical unit; static integer i__, j; diff --git a/ctest/c_cblat3c.c b/ctest/c_cblat3c.c index 1f4b967b0..5ad9b8bd8 100644 --- a/ctest/c_cblat3c.c +++ b/ctest/c_cblat3c.c @@ -242,130 +242,6 @@ typedef struct Namelist Namelist; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -#if 0 -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif -/* -- translated by f2c (version 20000121). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - - /* Common Block Declarations */ diff --git a/ctest/c_cblat3c_3m.c b/ctest/c_cblat3c_3m.c new file mode 100644 index 000000000..b5d6bf9cb --- /dev/null +++ b/ctest/c_cblat3c_3m.c @@ -0,0 +1,3942 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#include "common.h" + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 + + +/* Common Block Declarations */ + +struct { + integer infot, noutc; + logical ok, lerr; +} infoc_; + +#define infoc_1 infoc_ + +struct { + char srnamt[12]; +} srnamc_; + +#define srnamc_1 srnamc_ + +/* Table of constant values */ + +static complex c_b1 = {0.f,0.f}; +static complex c_b2 = {1.f,0.f}; +static integer c__1 = 1; +static integer c__65 = 65; +static integer c__6 = 6; +static real c_b91 = 1.f; +static logical c_true = TRUE_; +static integer c__0 = 0; +static logical c_false = FALSE_; + +int /* Main program */ main(void) +{ + /* Initialized data */ + + static char snames[9][13] = {"cblas_cgemm3m ", "cblas_chemm ", "cblas_csymm ", + "cblas_ctrmm ", "cblas_ctrsm ", "cblas_cherk ", "cblas_csyrk ", + "cblas_cher2k", "cblas_csyr2k"}; + + /* System generated locals */ + integer i__1, i__2, i__3, i__4, i__5; + real r__1; + + /* Local variables */ + integer nalf, idim[9]; + logical same; + integer nbet, ntra; + logical rewi; + extern /* Subroutine */ int cchk1_(char *, real *, real *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, complex *, integer *, complex *, integer *, complex *, + complex *, complex *, complex *, complex *, complex *, complex *, + complex *, complex *, complex *, real *, integer *), + cchk2_(char *, real *, real *, integer *, integer *, logical *, + logical *, logical *, integer *, integer *, integer *, complex *, + integer *, complex *, integer *, complex *, complex *, complex *, + complex *, complex *, complex *, complex *, complex *, complex *, + complex *, real *, integer *), cchk3_(char *, real *, + real *, integer *, integer *, logical *, logical *, logical *, + integer *, integer *, integer *, complex *, integer *, complex *, + complex *, complex *, complex *, complex *, complex *, complex *, + real *, complex *, integer *), cchk4_(char *, real *, + real *, integer *, integer *, logical *, logical *, logical *, + integer *, integer *, integer *, complex *, integer *, complex *, + integer *, complex *, complex *, complex *, complex *, complex *, + complex *, complex *, complex *, complex *, complex *, real *, + integer *), cchk5_(char *, real *, real *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, complex *, integer *, complex *, integer *, complex *, + complex *, complex *, complex *, complex *, complex *, complex *, + complex *, complex *, real *, complex *, integer *); + complex c__[4225] /* was [65][65] */; + real g[65]; + integer i__, j, n; + logical fatal; + complex w[130]; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + extern real sdiff_(real *, real *); + logical trace; + integer nidim; + char snaps[32]; + integer isnum; + logical ltest[9]; + complex aa[4225], ab[8450] /* was [65][130] */, bb[4225], cc[4225], as[ + 4225], bs[4225], cs[4225], ct[65]; + logical sfatal, corder; + char snamet[12], transa[1], transb[1]; + real thresh; + logical rorder; + extern /* Subroutine */ int cc3chke_(char *); + integer layout; + logical ltestt, tsterr; + complex alf[7]; + extern logical lce_(complex *, complex *, integer *); + complex bet[7]; + real eps, err; + char tmpchar; + +/* Test program for the COMPLEX Level 3 Blas. */ + +/* The program must be driven by a short data file. The first 13 records */ +/* of the file are read using list-directed input, the last 9 records */ +/* are read using the format ( A12, L2 ). An annotated example of a data */ +/* file can be obtained by deleting the first 3 characters from the */ +/* following 22 lines: */ +/* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE */ +/* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) */ +/* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. */ +/* F LOGICAL FLAG, T TO STOP ON FAILURES. */ +/* T LOGICAL FLAG, T TO TEST ERROR CALL MYEXITS. */ +/* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH */ +/* 16.0 THRESHOLD VALUE OF TEST RATIO */ +/* 6 NUMBER OF VALUES OF N */ +/* 0 1 2 3 5 9 VALUES OF N */ +/* 3 NUMBER OF VALUES OF ALPHA */ +/* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA */ +/* 3 NUMBER OF VALUES OF BETA */ +/* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA */ +/* cblas_cgemm T PUT F FOR NO TEST. SAME COLUMNS. */ +/* cblas_chemm T PUT F FOR NO TEST. SAME COLUMNS. */ +/* cblas_csymm T PUT F FOR NO TEST. SAME COLUMNS. */ +/* cblas_ctrmm T PUT F FOR NO TEST. SAME COLUMNS. */ +/* cblas_ctrsm T PUT F FOR NO TEST. SAME COLUMNS. */ +/* cblas_cherk T PUT F FOR NO TEST. SAME COLUMNS. */ +/* cblas_csyrk T PUT F FOR NO TEST. SAME COLUMNS. */ +/* cblas_cher2k T PUT F FOR NO TEST. SAME COLUMNS. */ +/* cblas_csyr2k T PUT F FOR NO TEST. SAME COLUMNS. */ + +/* See: */ + +/* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. */ +/* A Set of Level 3 Basic Linear Algebra Subprograms. */ + +/* Technical Memorandum No.88 (Revision 1), Mathematics and */ +/* Computer Science Division, Argonne National Laboratory, 9700 */ +/* South Cass Avenue, Argonne, Illinois 60439, US. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + + infoc_1.noutc = 6; + +/* Read name and unit number for snapshot output file and open file. */ + char line[80]; + + fgets(line,80,stdin); + sscanf(line,"'%s'",snaps); + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&ntra); +#else + sscanf(line,"%d",&ntra); +#endif + trace = ntra >= 0; + if (trace) { +/* o__1.oerr = 0; + o__1.ounit = ntra; + o__1.ofnmlen = 32; + o__1.ofnm = snaps; + o__1.orl = 0; + o__1.osta = 0; + o__1.oacc = 0; + o__1.ofm = 0; + o__1.oblnk = 0; + f_open(&o__1);*/ + } +/* Read the flag that directs rewinding of the snapshot file. */ + fgets(line,80,stdin); + sscanf(line,"%d",&rewi); + rewi = rewi && trace; +/* Read the flag that directs stopping on any failure. */ + fgets(line,80,stdin); + sscanf(line,"%c",&tmpchar); + sfatal=FALSE_; + if (tmpchar=='T')sfatal=TRUE_; + fgets(line,80,stdin); + sscanf(line,"%c",&tmpchar); + tsterr=FALSE_; + if (tmpchar=='T')tsterr=TRUE_; + fgets(line,80,stdin); + sscanf(line,"%d",&layout); + fgets(line,80,stdin); + sscanf(line,"%f",&thresh); + + +/* Read and check the parameter values for the tests. */ + +/* Values of N */ + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&nidim); +#else + sscanf(line,"%d",&nidim); +#endif + + if (nidim < 1 || nidim > 9) { + fprintf(stderr,"NUMBER OF VALUES OF N IS LESS THAN 1 OR GREATER THAN 9"); + goto L220; + } + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld %ld %ld %ld %ld %ld %ld %ld %ld",&idim[0],&idim[1],&idim[2], + &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]); +#else + sscanf(line,"%d %d %d %d %d %d %d %d %d",&idim[0],&idim[1],&idim[2], + &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]); +#endif + i__1 = nidim; + for (i__ = 1; i__ <= i__1; ++i__) { + if (idim[i__ - 1] < 0 || idim[i__ - 1] > 65) { + fprintf(stderr,"VALUE OF N IS LESS THAN 0 OR GREATER THAN 65\n"); + goto L220; + } +/* L10: */ + } +/* Values of ALPHA */ + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&nalf); +#else + sscanf(line,"%d",&nalf); +#endif + if (nalf < 1 || nalf > 7) { + fprintf(stderr,"VALUE OF ALPHA IS LESS THAN 0 OR GREATER THAN 7\n"); + goto L220; + } + fgets(line,80,stdin); + sscanf(line,"(%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f)",&alf[0].r,&alf[0].i,&alf[1].r,&alf[1].i,&alf[2].r,&alf[2].i,&alf[3].r,&alf[3].i, + &alf[4].r,&alf[4].i,&alf[5].r,&alf[5].i,&alf[6].r,&alf[6].i); + +// i__1 = nalf; +// for (i__ = 1; i__ <= i__1; ++i__) { +// do_lio(&c__6, &c__1, (char *)&alf[i__ - 1], (ftnlen)sizeof(complex)); +// } +/* Values of BETA */ + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&nbet); +#else + sscanf(line,"%d",&nbet); +#endif + if (nalf < 1 || nbet > 7) { + fprintf(stderr,"VALUE OF BETA IS LESS THAN 0 OR GREATER THAN 7\n"); + goto L220; + } + fgets(line,80,stdin); + sscanf(line,"(%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f)",&bet[0].r,&bet[0].i,&bet[1].r,&bet[1].i,&bet[2].r,&bet[2].i,&bet[3].r,&bet[3].i, + &bet[4].r,&bet[4].i,&bet[5].r,&bet[5].i,&bet[6].r,&bet[6].i); + + +/* Report values of parameters. */ + + printf("TESTS OF THE COMPLEX LEVEL 3 BLAS\nTHE FOLLOWING PARAMETER VALUES WILL BE USED:\n"); + printf(" FOR N"); + for (i__ =1; i__ <=nidim;++i__) printf(" %d",idim[i__-1]); + printf("\n"); + printf(" FOR ALPHA"); + for (i__ =1; i__ <=nalf;++i__) printf(" (%f,%f)",alf[i__-1].r,alf[i__-1].i); + printf("\n"); + printf(" FOR BETA"); + for (i__ =1; i__ <=nbet;++i__) printf(" (%f,%f)",bet[i__-1].r,bet[i__-1].i); + printf("\n"); + + if (! tsterr) { + printf(" ERROR-EXITS WILL NOT BE TESTED\n"); + } + printf("ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LESS THAN %f\n",thresh); + rorder = FALSE_; + corder = FALSE_; + if (layout == 2) { + rorder = TRUE_; + corder = TRUE_; + printf("COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED\n"); + } else if (layout == 1) { + rorder = TRUE_; + printf("ROW-MAJOR DATA LAYOUT IS TESTED\n"); + } else if (layout == 0) { + corder = TRUE_; + printf("COLUMN-MAJOR DATA LAYOUT IS TESTED\n"); + } + +/* Read names of subroutines and flags which indicate */ +/* whether they are to be tested. */ + + for (i__ = 1; i__ <= 9; ++i__) { + ltest[i__ - 1] = FALSE_; +/* L20: */ + } +L30: + if (! fgets(line,80,stdin)) { + goto L60; + } + i__1 = sscanf(line,"%12c %c",snamet,&tmpchar); + ltestt=FALSE_; + if (tmpchar=='T')ltestt=TRUE_; + if (i__1 < 2) { + goto L60; + } + for (i__ = 1; i__ <= 9; ++i__) { + if (s_cmp(snamet, snames[i__ - 1] , (ftnlen)12, (ftnlen)12) == + 0) { + goto L50; + } +/* L40: */ + } + printf("SUBPROGRAM NAME %s NOT RECOGNIZED\n****** TESTS ABANDONED ******\n",snamet); + exit(1); +L50: + ltest[i__ - 1] = ltestt; + goto L30; + +L60: +/* cl__1.cerr = 0; + cl__1.cunit = 5; + cl__1.csta = 0; + f_clos(&cl__1);*/ + +/* Compute EPS (the machine precision). */ + + eps = 1.f; +L70: + r__1 = eps + 1.f; + if (sdiff_(&r__1, &c_b91) == 0.f) { + goto L80; + } + eps *= .5f; + goto L70; +L80: + eps += eps; + printf("RELATIVE MACHINE PRECISION IS TAKEN TO BE %9.1g\n",eps); + +/* Check the reliability of CMMCH using exact data. */ + + n = 32; + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * 65 - 66; +/* Computing MAX */ + i__5 = i__ - j + 1; + i__4 = f2cmax(i__5,0); + ab[i__3].r = (real) i__4, ab[i__3].i = 0.f; +/* L90: */ + } + i__2 = j + 4224; + ab[i__2].r = (real) j, ab[i__2].i = 0.f; + i__2 = (j + 65) * 65 - 65; + ab[i__2].r = (real) j, ab[i__2].i = 0.f; + i__2 = j - 1; + c__[i__2].r = 0.f, c__[i__2].i = 0.f; +/* L100: */ + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = j - 1; + i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3; + cc[i__2].r = (real) i__3, cc[i__2].i = 0.f; +/* L110: */ + } +/* CC holds the exact result. On exit from CMMCH CT holds */ +/* the result computed by CMMCH. */ + *(unsigned char *)transa = 'N'; + *(unsigned char *)transb = 'N'; + cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lce_(cc, ct, &n); + if (! same || err != 0.f) { + printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("CMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); + } + *(unsigned char *)transb = 'C'; + cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lce_(cc, ct, &n); + if (! same || err != 0.f) { + printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("CMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = j + 4224; + i__3 = n - j + 1; + ab[i__2].r = (real) i__3, ab[i__2].i = 0.f; + i__2 = (j + 65) * 65 - 65; + i__3 = n - j + 1; + ab[i__2].r = (real) i__3, ab[i__2].i = 0.f; +/* L120: */ + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = n - j; + i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3; + cc[i__2].r = (real) i__3, cc[i__2].i = 0.f; +/* L130: */ + } + *(unsigned char *)transa = 'C'; + *(unsigned char *)transb = 'N'; + cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lce_(cc, ct, &n); + if (! same || err != 0.f) { + printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("CMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); + } + *(unsigned char *)transb = 'C'; + cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lce_(cc, ct, &n); + if (! same || err != 0.f) { + printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("CMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); + } + +/* Test each subroutine in turn. */ + + for (isnum = 1; isnum <= 9; ++isnum) { + if (! ltest[isnum - 1]) { +/* Subprogram is not to be tested. */ + printf("%12s WAS NOT TESTED\n",snames[isnum-1]); + } else { + s_copy(srnamc_1.srnamt, snames[isnum - 1], (ftnlen)12, ( + ftnlen)12); +/* Test error exits. */ + if (tsterr) { + cc3chke_(snames[isnum - 1]); + } +/* Test computations. */ + infoc_1.infot = 0; + infoc_1.ok = TRUE_; + fatal = FALSE_; + switch (isnum) { + case 1: goto L140; + case 2: goto L150; + case 3: goto L150; + case 4: goto L160; + case 5: goto L160; + case 6: goto L170; + case 7: goto L170; + case 8: goto L180; + case 9: goto L180; + } +/* Test CGEMM, 01. */ +L140: + if (corder) { + cchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0); + } + if (rorder) { + cchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1); + } + goto L190; +/* Test CHEMM, 02, CSYMM, 03. */ +L150: + if (corder) { + cchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0); + } + if (rorder) { + cchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1); + } + goto L190; +/* Test CTRMM, 04, CTRSM, 05. */ +L160: + if (corder) { + cchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & + c__0); + } + if (rorder) { + cchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & + c__1); + } + goto L190; +/* Test CHERK, 06, CSYRK, 07. */ +L170: + if (corder) { + cchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0); + } + if (rorder) { + cchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1); + } + goto L190; +/* Test CHER2K, 08, CSYR2K, 09. */ +L180: + if (corder) { + cchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, + ct, g, w, &c__0); + } + if (rorder) { + cchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, + ct, g, w, &c__1); + } + goto L190; + +L190: + if (fatal && sfatal) { + goto L210; + } + } +/* L200: */ + } + printf("\nEND OF TESTS\n"); + goto L230; + +L210: + printf("\n****** FATAL ERROR - TESTS ABANDONED ******\n"); + goto L230; + +L220: + printf("AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM\n"); + printf("****** TESTS ABANDONED ******\n"); +L230: + if (trace) { +/* cl__1.cerr = 0; + cl__1.cunit = ntra; + cl__1.csta = 0; + f_clos(&cl__1);*/ + } +/* cl__1.cerr = 0; + cl__1.cunit = 6; + cl__1.csta = 0; + f_clos(&cl__1); + s_stop("", (ftnlen)0);*/ + exit(0); + +/* End of CBLAT3. */ + + return 0; +} /* MAIN__ */ + +/* Subroutine */ int cchk1_(char *sname, real *eps, real *thresh, integer * + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + integer *nidim, integer *idim, integer *nalf, complex *alf, integer * + nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex * + as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, + complex *cs, complex *ct, real *g, integer *iorder) +{ + /* Initialized data */ + + static char ich[3] = "NTC"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7, i__8; + + /* Local variables */ + complex beta; + integer ldas, ldbs, ldcs; + logical same, null; + integer i__, k, m, n; + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, + complex *); + complex alpha; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + logical isame[13], trana, tranb; + integer nargs; + logical reset; + extern /* Subroutine */ int cprcn1_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, integer *, complex *, + integer *, integer *, complex *, integer *); + integer ia, ib, ma, mb, na, nb, nc, ik, im, in; + extern /* Subroutine */ int ccgemm3m_(integer *, char *, char *, integer *, + integer *, integer *, complex *, complex *, integer *, complex *, + integer *, complex *, complex *, integer *); + integer ks, ms, ns; + extern logical lceres_(char *, char *, integer *, integer *, complex *, + complex *, integer *); + char tranas[1], tranbs[1], transa[1], transb[1]; + real errmax; + integer ica, icb, laa, lbb, lda, lcc, ldb, ldc; + extern logical lce_(complex *, complex *, integer *); + complex als, bls; + real err; + +/* Tests CGEMM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + + nargs = 13; + nc = 0; + reset = TRUE_; + errmax = 0.f; + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = m; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L100; + } + lcc = ldc * n; + null = n <= 0 || m <= 0; + + i__3 = *nidim; + for (ik = 1; ik <= i__3; ++ik) { + k = idim[ik]; + + for (ica = 1; ica <= 3; ++ica) { + *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1] + ; + trana = *(unsigned char *)transa == 'T' || *(unsigned + char *)transa == 'C'; + + if (trana) { + ma = k; + na = m; + } else { + ma = m; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + +/* Generate the matrix A. */ + + cmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[ + 1], &lda, &reset, &c_b1); + + for (icb = 1; icb <= 3; ++icb) { + *(unsigned char *)transb = *(unsigned char *)&ich[icb + - 1]; + tranb = *(unsigned char *)transb == 'T' || *(unsigned + char *)transb == 'C'; + + if (tranb) { + mb = n; + nb = k; + } else { + mb = k; + nb = n; + } +/* Set LDB to 1 more than minimum value if room. */ + ldb = mb; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L70; + } + lbb = ldb * nb; + +/* Generate the matrix B. */ + + cmake_("ge", " ", " ", &mb, &nb, &b[b_offset], nmax, & + bb[1], &ldb, &reset, &c_b1); + + i__4 = *nalf; + for (ia = 1; ia <= i__4; ++ia) { + i__5 = ia; + alpha.r = alf[i__5].r, alpha.i = alf[i__5].i; + + i__5 = *nbet; + for (ib = 1; ib <= i__5; ++ib) { + i__6 = ib; + beta.r = bet[i__6].r, beta.i = bet[i__6].i; + +/* Generate the matrix C. */ + + cmake_("ge", " ", " ", &m, &n, &c__[c_offset], + nmax, &cc[1], &ldc, &reset, &c_b1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)tranas = *(unsigned char *) + transa; + *(unsigned char *)tranbs = *(unsigned char *) + transb; + ms = m; + ns = n; + ks = k; + als.r = alpha.r, als.i = alpha.i; + i__6 = laa; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + as[i__7].r = aa[i__8].r, as[i__7].i = aa[ + i__8].i; +/* L10: */ + } + ldas = lda; + i__6 = lbb; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + bs[i__7].r = bb[i__8].r, bs[i__7].i = bb[ + i__8].i; +/* L20: */ + } + ldbs = ldb; + bls.r = beta.r, bls.i = beta.i; + i__6 = lcc; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + cs[i__7].r = cc[i__8].r, cs[i__7].i = cc[ + i__8].i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (*trace) { + cprcn1_(ntra, &nc, sname, iorder, transa, + transb, &m, &n, &k, &alpha, &lda, + &ldb, &beta, &ldc); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); */ + } + ccgemm3m_(iorder, transa, transb, &m, &n, &k, & + alpha, &aa[1], &lda, &bb[1], &ldb, & + beta, &cc[1], &ldc); + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { +// io___128.ciunit = *nout; +// s_wsfe(&io___128); +// e_wsfe(); + printf(" *** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); + *fatal = TRUE_; + goto L120; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)transa == *( + unsigned char *)tranas; + isame[1] = *(unsigned char *)transb == *( + unsigned char *)tranbs; + isame[2] = ms == m; + isame[3] = ns == n; + isame[4] = ks == k; + isame[5] = als.r == alpha.r && als.i == + alpha.i; + isame[6] = lce_(&as[1], &aa[1], &laa); + isame[7] = ldas == lda; + isame[8] = lce_(&bs[1], &bb[1], &lbb); + isame[9] = ldbs == ldb; + isame[10] = bls.r == beta.r && bls.i == + beta.i; + if (null) { + isame[11] = lce_(&cs[1], &cc[1], &lcc); + } else { + isame[11] = lceres_("ge", " ", &m, &n, & + cs[1], &cc[1], &ldc); + } + isame[12] = ldcs == ldc; + +/* If data was incorrectly changed, report */ +/* and return. */ + + same = TRUE_; + i__6 = nargs; + for (i__ = 1; i__ <= i__6; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);; + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L120; + } + + if (! null) { + +/* Check the result. */ + + cmmch_(transa, transb, &m, &n, &k, &alpha, + &a[a_offset], nmax, &b[b_offset], + nmax, &beta, &c__[c_offset], + nmax, &ct[1], &g[1], &cc[1], &ldc, + eps, &err, fatal, nout, &c_true); + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L120; + } + } + +/* L50: */ + } + +/* L60: */ + } + +L70: + ; + } + +L80: + ; + } + +/* L90: */ + } + +L100: + ; + } + +/* L110: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + if (*iorder == 1) { + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + } else { + if (*iorder == 0) { + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + if (*iorder == 1) { + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + } + goto L130; + +L120: + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + cprcn1_(nout, &nc, sname, iorder, transa, transb, &m, &n, &k, &alpha, & + lda, &ldb, &beta, &ldc); + +L130: + return 0; + +/* 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', */ +/* $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, */ +/* $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) */ + +/* End of CCHK1. */ + +} /* cchk1_ */ + + +/* Subroutine */ int cprcn1_(integer *nout, integer *nc, char *sname, integer + *iorder, char *transa, char *transb, integer *m, integer *n, integer * + k, complex *alpha, integer *lda, integer *ldb, complex *beta, integer + *ldc) +{ + /* Local variables */ + char crc[14], cta[14], ctb[14]; + + if (*(unsigned char *)transa == 'N') { + s_copy(cta, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(cta, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cta, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transb == 'N') { + s_copy(ctb, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transb == 'T') { + s_copy(ctb, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ctb, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cta,ctb); + printf("%d %d %d (%4.1f,%4.1f) , A, %d, B, %d, (%4.1f,%4.1f) , C, %d.\n",*m,*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); + return 0; +} /* cprcn1_ */ + + +/* Subroutine */ int cchk2_(char *sname, real *eps, real *thresh, integer * + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + integer *nidim, integer *idim, integer *nalf, complex *alf, integer * + nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex * + as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, + complex *cs, complex *ct, real *g, integer *iorder) +{ + /* Initialized data */ + + static char ichs[2] = "LR"; + static char ichu[2] = "UL"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + + /* Local variables */ + complex beta; + integer ldas, ldbs, ldcs; + logical same; + char side[1]; + logical conj, left, null; + char uplo[1]; + integer i__, m, n; + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, + complex *); + complex alpha; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + logical isame[13]; + char sides[1]; + integer nargs; + logical reset; + char uplos[1]; + extern /* Subroutine */ int cprcn2_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, complex *, integer *, + integer *, complex *, integer *); + integer ia, ib, na, nc, im, in; + extern /* Subroutine */ int cchemm_(integer *, char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + integer ms, ns; + extern logical lceres_(char *, char *, integer *, integer *, complex *, + complex *, integer *); + extern /* Subroutine */ int ccsymm_(integer *, char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + real errmax; + integer laa, lbb, lda, lcc, ldb, ldc; + extern logical lce_(complex *, complex *, integer *); + integer ics; + complex als, bls; + integer icu; + real err; + +/* Tests CHEMM and CSYMM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 12; + nc = 0; + reset = TRUE_; + errmax = 0.f; + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = m; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L90; + } + lcc = ldc * n; + null = n <= 0 || m <= 0; +/* Set LDB to 1 more than minimum value if room. */ + ldb = m; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L90; + } + lbb = ldb * n; + +/* Generate the matrix B. */ + + cmake_("ge", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, & + reset, &c_b1); + + for (ics = 1; ics <= 2; ++ics) { + *(unsigned char *)side = *(unsigned char *)&ichs[ics - 1]; + left = *(unsigned char *)side == 'L'; + + if (left) { + na = m; + } else { + na = n; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = na; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + +/* Generate the hermitian or symmetric matrix A. */ + + cmake_(sname + 7, uplo, " ", &na, &na, &a[a_offset], nmax, + &aa[1], &lda, &reset, &c_b1); + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + +/* Generate the matrix C. */ + + cmake_("ge", " ", " ", &m, &n, &c__[c_offset], + nmax, &cc[1], &ldc, &reset, &c_b1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)sides = *(unsigned char *)side; + *(unsigned char *)uplos = *(unsigned char *)uplo; + ms = m; + ns = n; + als.r = alpha.r, als.i = alpha.i; + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + i__5 = lbb; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7] + .i; +/* L20: */ + } + ldbs = ldb; + bls.r = beta.r, bls.i = beta.i; + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (*trace) { + cprcn2_(ntra, &nc, sname, iorder, side, uplo, + &m, &n, &alpha, &lda, &ldb, &beta, & + ldc) + ; + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + if (conj) { + cchemm_(iorder, side, uplo, &m, &n, &alpha, & + aa[1], &lda, &bb[1], &ldb, &beta, &cc[ + 1], &ldc); + } else { + ccsymm_(iorder, side, uplo, &m, &n, &alpha, & + aa[1], &lda, &bb[1], &ldb, &beta, &cc[ + 1], &ldc); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); + *fatal = TRUE_; + goto L110; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)sides == *(unsigned + char *)side; + isame[1] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[2] = ms == m; + isame[3] = ns == n; + isame[4] = als.r == alpha.r && als.i == alpha.i; + isame[5] = lce_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + isame[7] = lce_(&bs[1], &bb[1], &lbb); + isame[8] = ldbs == ldb; + isame[9] = bls.r == beta.r && bls.i == beta.i; + if (null) { + isame[10] = lce_(&cs[1], &cc[1], &lcc); + } else { + isame[10] = lceres_("ge", " ", &m, &n, &cs[1], + &cc[1], &ldc); + } + isame[11] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L110; + } + + if (! null) { + +/* Check the result. */ + + if (left) { + cmmch_("N", "N", &m, &n, &m, &alpha, &a[ + a_offset], nmax, &b[b_offset], + nmax, &beta, &c__[c_offset], nmax, + &ct[1], &g[1], &cc[1], &ldc, eps, + &err, fatal, nout, &c_true); + } else { + cmmch_("N", "N", &m, &n, &n, &alpha, &b[ + b_offset], nmax, &a[a_offset], + nmax, &beta, &c__[c_offset], nmax, + &ct[1], &g[1], &cc[1], &ldc, eps, + &err, fatal, nout, &c_true); + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L110; + } + } + +/* L50: */ + } + +/* L60: */ + } + +/* L70: */ + } + +L80: + ; + } + +L90: + ; + } + +/* L100: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + if (*iorder == 1) { + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + } else { + if (*iorder == 0) { + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + if (*iorder == 1) { + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + } + goto L120; + +L110: + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + cprcn2_(nout, &nc, sname, iorder, side, uplo, &m, &n, &alpha, &lda, &ldb, + &beta, &ldc); + +L120: + return 0; + +/* 9995 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */ +/* $ ',', F4.1, '), C,', I3, ') .' ) */ + +/* End of CCHK2. */ + +} /* cchk2_ */ + + +/* Subroutine */ int cprcn2_(integer *nout, integer *nc, char *sname, integer + *iorder, char *side, char *uplo, integer *m, integer *n, complex * + alpha, integer *lda, integer *ldb, complex *beta, integer *ldc) +{ + /* Local variables */ + char cs[14], cu[14], crc[14]; + + if (*(unsigned char *)side == 'L') { + s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cs, " CblasRight", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu); + printf("%d %d (%4.1f,%4.1f) , A, %d, B, %d, (%4.1f,%4.1f) , C, %d.\n",*m,*n,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); + return 0; +} /* cprcn2_ */ + + +/* Subroutine */ int cchk3_(char *sname, real *eps, real *thresh, integer * + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + integer *nidim, integer *idim, integer *nalf, complex *alf, integer * + nmax, complex *a, complex *aa, complex *as, complex *b, complex *bb, + complex *bs, complex *ct, real *g, complex *c__, integer *iorder) +{ + /* Initialized data */ + + static char ichu[2] = "UL"; + static char icht[3] = "NTC"; + static char ichd[2] = "UN"; + static char ichs[2] = "LR"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + complex q__1; + + /* Local variables */ + char diag[1]; + integer ldas, ldbs; + logical same; + char side[1]; + logical left, null; + char uplo[1]; + integer i__, j, m, n; + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, + complex *); + complex alpha; + char diags[1]; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + logical isame[13]; + char sides[1]; + integer nargs; + logical reset; + char uplos[1]; + extern /* Subroutine */ int cprcn3_(integer *, integer *, char *, integer + *, char *, char *, char *, char *, integer *, integer *, complex * + , integer *, integer *); + integer ia, na, nc, im, in, ms, ns; + extern logical lceres_(char *, char *, integer *, integer *, complex *, + complex *, integer *); + extern /* Subroutine */ int cctrmm_(integer *, char *, char *, char *, + char *, integer *, integer *, complex *, complex *, integer *, + complex *, integer *); + char tranas[1], transa[1]; + extern /* Subroutine */ int cctrsm_(integer *, char *, char *, char *, + char *, integer *, integer *, complex *, complex *, integer *, + complex *, integer *); + real errmax; + integer laa, icd, lbb, lda, ldb; + extern logical lce_(complex *, complex *, integer *); + integer ics; + complex als; + integer ict, icu; + real err; + +/* Tests CTRMM and CTRSM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --g; + --ct; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + + nargs = 11; + nc = 0; + reset = TRUE_; + errmax = 0.f; +/* Set up zero matrix for CMMCH. */ + i__1 = *nmax; + for (j = 1; j <= i__1; ++j) { + i__2 = *nmax; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * c_dim1; + c__[i__3].r = 0.f, c__[i__3].i = 0.f; +/* L10: */ + } +/* L20: */ + } + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDB to 1 more than minimum value if room. */ + ldb = m; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L130; + } + lbb = ldb * n; + null = m <= 0 || n <= 0; + + for (ics = 1; ics <= 2; ++ics) { + *(unsigned char *)side = *(unsigned char *)&ichs[ics - 1]; + left = *(unsigned char *)side == 'L'; + if (left) { + na = m; + } else { + na = n; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = na; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L130; + } + laa = lda * na; + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + + for (ict = 1; ict <= 3; ++ict) { + *(unsigned char *)transa = *(unsigned char *)&icht[ + ict - 1]; + + for (icd = 1; icd <= 2; ++icd) { + *(unsigned char *)diag = *(unsigned char *)&ichd[ + icd - 1]; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + +/* Generate the matrix A. */ + + cmake_("tr", uplo, diag, &na, &na, &a[ + a_offset], nmax, &aa[1], &lda, &reset, + &c_b1); + +/* Generate the matrix B. */ + + cmake_("ge", " ", " ", &m, &n, &b[b_offset], + nmax, &bb[1], &ldb, &reset, &c_b1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)sides = *(unsigned char *) + side; + *(unsigned char *)uplos = *(unsigned char *) + uplo; + *(unsigned char *)tranas = *(unsigned char *) + transa; + *(unsigned char *)diags = *(unsigned char *) + diag; + ms = m; + ns = n; + als.r = alpha.r, als.i = alpha.i; + i__4 = laa; + for (i__ = 1; i__ <= i__4; ++i__) { + i__5 = i__; + i__6 = i__; + as[i__5].r = aa[i__6].r, as[i__5].i = aa[ + i__6].i; +/* L30: */ + } + ldas = lda; + i__4 = lbb; + for (i__ = 1; i__ <= i__4; ++i__) { + i__5 = i__; + i__6 = i__; + bs[i__5].r = bb[i__6].r, bs[i__5].i = bb[ + i__6].i; +/* L40: */ + } + ldbs = ldb; + +/* Call the subroutine. */ + + if (s_cmp(sname + 9, "mm", (ftnlen)2, (ftnlen) + 2) == 0) { + if (*trace) { + cprcn3_(ntra, &nc, sname, iorder, + side, uplo, transa, diag, &m, + &n, &alpha, &lda, &ldb/*, ( + ftnlen)12, (ftnlen)1, (ftnlen) + 1, (ftnlen)1, (ftnlen)1*/); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + cctrmm_(iorder, side, uplo, transa, diag, + &m, &n, &alpha, &aa[1], &lda, &bb[ + 1], &ldb); + } else if (s_cmp(sname + 9, "sm", (ftnlen)2, ( + ftnlen)2) == 0) { + if (*trace) { + cprcn3_(ntra, &nc, sname, iorder, + side, uplo, transa, diag, &m, + &n, &alpha, &lda, &ldb/*, ( + ftnlen)12, (ftnlen)1, (ftnlen) + 1, (ftnlen)1, (ftnlen)1*/); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + cctrsm_(iorder, side, uplo, transa, diag, + &m, &n, &alpha, &aa[1], &lda, &bb[ + 1], &ldb); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); + *fatal = TRUE_; + goto L150; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)sides == *( + unsigned char *)side; + isame[1] = *(unsigned char *)uplos == *( + unsigned char *)uplo; + isame[2] = *(unsigned char *)tranas == *( + unsigned char *)transa; + isame[3] = *(unsigned char *)diags == *( + unsigned char *)diag; + isame[4] = ms == m; + isame[5] = ns == n; + isame[6] = als.r == alpha.r && als.i == + alpha.i; + isame[7] = lce_(&as[1], &aa[1], &laa); + isame[8] = ldas == lda; + if (null) { + isame[9] = lce_(&bs[1], &bb[1], &lbb); + } else { + isame[9] = lceres_("ge", " ", &m, &n, &bs[ + 1], &bb[1], &ldb); + } + isame[10] = ldbs == ldb; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__4 = nargs; + for (i__ = 1; i__ <= i__4; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); + } +/* L50: */ + } + if (! same) { + *fatal = TRUE_; + goto L150; + } + + if (! null) { + if (s_cmp(sname + 9, "mm", (ftnlen)2, ( + ftnlen)2) == 0) { + +/* Check the result. */ + + if (left) { + cmmch_(transa, "N", &m, &n, &m, & + alpha, &a[a_offset], nmax, + &b[b_offset], nmax, & + c_b1, &c__[c_offset], + nmax, &ct[1], &g[1], &bb[ + 1], &ldb, eps, &err, + fatal, nout, &c_true/*, ( + ftnlen)1, (ftnlen)1*/); + } else { + cmmch_("N", transa, &m, &n, &n, & + alpha, &b[b_offset], nmax, + &a[a_offset], nmax, & + c_b1, &c__[c_offset], + nmax, &ct[1], &g[1], &bb[ + 1], &ldb, eps, &err, + fatal, nout, &c_true); + } + } else if (s_cmp(sname + 9, "sm", (ftnlen) + 2, (ftnlen)2) == 0) { + +/* Compute approximation to original */ +/* matrix. */ + + i__4 = n; + for (j = 1; j <= i__4; ++j) { + i__5 = m; + for (i__ = 1; i__ <= i__5; ++i__) + { + i__6 = i__ + j * c_dim1; + i__7 = i__ + (j - 1) * ldb; + c__[i__6].r = bb[i__7].r, c__[i__6].i = bb[i__7].i; + i__6 = i__ + (j - 1) * ldb; + i__7 = i__ + j * b_dim1; + q__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, + q__1.i = alpha.r * b[i__7].i + alpha.i * b[ + i__7].r; + bb[i__6].r = q__1.r, bb[i__6].i = q__1.i; +/* L60: */ + } +/* L70: */ + } + + if (left) { + cmmch_(transa, "N", &m, &n, &m, & + c_b2, &a[a_offset], nmax, + &c__[c_offset], nmax, & + c_b1, &b[b_offset], nmax, + &ct[1], &g[1], &bb[1], & + ldb, eps, &err, fatal, + nout, &c_false); + } else { + cmmch_("N", transa, &m, &n, &n, & + c_b2, &c__[c_offset], + nmax, &a[a_offset], nmax, + &c_b1, &b[b_offset], nmax, + &ct[1], &g[1], &bb[1], & + ldb, eps, &err, fatal, + nout, &c_false); + } + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L150; + } + } + +/* L80: */ + } + +/* L90: */ + } + +/* L100: */ + } + +/* L110: */ + } + +/* L120: */ + } + +L130: + ; + } + +/* L140: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + if (*iorder == 1) { + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + } else { + if (*iorder == 0) { + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + if (*iorder == 1) { + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + } + goto L160; + +L150: + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + if (*trace) { + cprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, & + alpha, &lda, &ldb); + } + +L160: + return 0; + +/* 9995 FORMAT(1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', */ +/* $ ' .' ) */ + +/* End of CCHK3. */ + +} /* cchk3_ */ + + +/* Subroutine */ int cprcn3_(integer *nout, integer *nc, char *sname, integer + *iorder, char *side, char *uplo, char *transa, char *diag, integer *m, + integer *n, complex *alpha, integer *lda, integer *ldb) +{ + /* Local variables */ + char ca[14], cd[14], cs[14], cu[14], crc[14]; + + if (*(unsigned char *)side == 'L') { + s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cs, " CblasRight", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)diag == 'N') { + s_copy(cd, " CblasNonUnit", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cd, " CblasUnit", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu); + printf(" %s %s %d %d (%4.1f,%4.1f) A %d B %d\n",ca,cd,*m,*n,alpha->r,alpha->i,*lda,*ldb); + + return 0; +} /* cprcn3_ */ + + +/* Subroutine */ int cchk4_(char *sname, real *eps, real *thresh, integer * + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + integer *nidim, integer *idim, integer *nalf, complex *alf, integer * + nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex * + as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, + complex *cs, complex *ct, real *g, integer *iorder) +{ + /* Initialized data */ + + static char icht[2] = "NC"; + static char ichu[2] = "UL"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + complex q__1; + + /* Local variables */ + complex beta; + integer ldas, ldcs; + logical same, conj; + complex bets; + real rals; + logical tran, null; + char uplo[1]; + integer i__, j, k, n; + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, + complex *); + complex alpha; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + real rbeta; + logical isame[13]; + integer nargs; + real rbets; + logical reset; + char trans[1]; + logical upper; + char uplos[1]; + extern /* Subroutine */ int cprcn4_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, complex *, integer *, + complex *, integer *), cprcn6_(integer *, + integer *, char *, integer *, char *, char *, integer *, integer * + , real *, integer *, real *, integer *); + integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks; + extern /* Subroutine */ int ccherk_(integer *, char *, char *, integer *, + integer *, real *, complex *, integer *, real *, complex *, + integer *); + integer ns; + real ralpha; + extern logical lceres_(char *, char *, integer *, integer *, complex *, + complex *, integer *); + real errmax; + extern /* Subroutine */ int ccsyrk_(integer *, char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, complex *, + integer *); + char transs[1], transt[1]; + integer laa, lda, lcc, ldc; + extern logical lce_(complex *, complex *, integer *); + complex als; + integer ict, icu; + real err; + +/* Tests CHERK and CSYRK. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 10; + nc = 0; + reset = TRUE_; + errmax = 0.f; + rals = 1.f; + rbets = 1.f; + + i__1 = *nidim; + for (in = 1; in <= i__1; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = n; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L100; + } + lcc = ldc * n; + + i__2 = *nidim; + for (ik = 1; ik <= i__2; ++ik) { + k = idim[ik]; + + for (ict = 1; ict <= 2; ++ict) { + *(unsigned char *)trans = *(unsigned char *)&icht[ict - 1]; + tran = *(unsigned char *)trans == 'C'; + if (tran && ! conj) { + *(unsigned char *)trans = 'T'; + } + if (tran) { + ma = k; + na = n; + } else { + ma = n; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + +/* Generate the matrix A. */ + + cmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[1], & + lda, &reset, &c_b1); + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + upper = *(unsigned char *)uplo == 'U'; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + if (conj) { + ralpha = alpha.r; + q__1.r = ralpha, q__1.i = 0.f; + alpha.r = q__1.r, alpha.i = q__1.i; + } + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + if (conj) { + rbeta = beta.r; + q__1.r = rbeta, q__1.i = 0.f; + beta.r = q__1.r, beta.i = q__1.i; + } + null = n <= 0; + if (conj) { + null = null || ((k <= 0 || ralpha == 0.f) && + rbeta == 1.f); + } + +/* Generate the matrix C. */ + + cmake_(sname + 7, uplo, " ", &n, &n, &c__[ + c_offset], nmax, &cc[1], &ldc, &reset, & + c_b1); + + ++nc; + +/* Save every datum before calling the subroutine. */ + + *(unsigned char *)uplos = *(unsigned char *)uplo; + *(unsigned char *)transs = *(unsigned char *) + trans; + ns = n; + ks = k; + if (conj) { + rals = ralpha; + } else { + als.r = alpha.r, als.i = alpha.i; + } + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + if (conj) { + rbets = rbeta; + } else { + bets.r = beta.r, bets.i = beta.i; + } + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L20: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (conj) { + if (*trace) { + cprcn6_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &ralpha, &lda, & + rbeta, &ldc); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + ccherk_(iorder, uplo, trans, &n, &k, &ralpha, + &aa[1], &lda, &rbeta, &cc[1], &ldc); + } else { + if (*trace) { + cprcn4_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, & + beta, &ldc); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + ccsyrk_(iorder, uplo, trans, &n, &k, &alpha, & + aa[1], &lda, &beta, &cc[1], &ldc); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); + *fatal = TRUE_; + goto L120; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[1] = *(unsigned char *)transs == *(unsigned + char *)trans; + isame[2] = ns == n; + isame[3] = ks == k; + if (conj) { + isame[4] = rals == ralpha; + } else { + isame[4] = als.r == alpha.r && als.i == + alpha.i; + } + isame[5] = lce_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + if (conj) { + isame[7] = rbets == rbeta; + } else { + isame[7] = bets.r == beta.r && bets.i == + beta.i; + } + if (null) { + isame[8] = lce_(&cs[1], &cc[1], &lcc); + } else { + isame[8] = lceres_(sname + 7, uplo, &n, &n, & + cs[1], &cc[1], &ldc); + } + isame[9] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); + } +/* L30: */ + } + if (! same) { + *fatal = TRUE_; + goto L120; + } + + if (! null) { + +/* Check the result column by column. */ + + if (conj) { + *(unsigned char *)transt = 'C'; + } else { + *(unsigned char *)transt = 'T'; + } + jc = 1; + i__5 = n; + for (j = 1; j <= i__5; ++j) { + if (upper) { + jj = 1; + lj = j; + } else { + jj = j; + lj = n - j + 1; + } + if (tran) { + cmmch_(transt, "N", &lj, &c__1, &k, & + alpha, &a[jj * a_dim1 + 1], + nmax, &a[j * a_dim1 + 1], + nmax, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true); + } else { + cmmch_("N", transt, &lj, &c__1, &k, & + alpha, &a[jj + a_dim1], nmax, + &a[j + a_dim1], nmax, &beta, & + c__[jj + j * c_dim1], nmax, & + ct[1], &g[1], &cc[jc], &ldc, + eps, &err, fatal, nout, & + c_true); + } + if (upper) { + jc += ldc; + } else { + jc = jc + ldc + 1; + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L110; + } +/* L40: */ + } + } + +/* L50: */ + } + +/* L60: */ + } + +/* L70: */ + } + +L80: + ; + } + +/* L90: */ + } + +L100: + ; + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + if (*iorder == 1) { + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + } else { + if (*iorder == 0) { + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + if (*iorder == 1) { + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + } + goto L130; + +L110: + if (n > 1) { + printf(" THESE ARE THE RESULTS FOR COLUMN %d:\n",j); + } + +L120: + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + if (conj) { + cprcn6_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &ralpha, &lda, + &rbeta, &ldc); + } else { + cprcn4_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + beta, &ldc); + } + +L130: + return 0; + +/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', */ +/* $ ' .' ) */ +/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, */ +/* $ '), C,', I3, ') .' ) */ + +/* End of CCHK4. */ + +} /* cchk4_ */ + + +/* Subroutine */ int cprcn4_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, complex * + alpha, integer *lda, complex *beta, integer *ldc) +{ + /* Local variables */ + char ca[14], cu[14], crc[14]; + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("( %d %d (%4.1f,%4.1f) A %d (%4.1f,%4.1f) C %d\n",*n,*k,alpha->r,alpha->i,*lda,beta->r,beta->i,*ldc); + return 0; +} /* cprcn4_ */ + + + +/* Subroutine */ int cprcn6_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, real * + alpha, integer *lda, real *beta, integer *ldc) +{ + /* Local variables */ + char ca[14], cu[14], crc[14]; + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("( %d %d %4.1f A %d %4.1f C %d\n",*n,*k,*alpha,*lda,*beta,*ldc); + return 0; +} /* cprcn6_ */ + + +/* Subroutine */ int cchk5_(char *sname, real *eps, real *thresh, integer * + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + integer *nidim, integer *idim, integer *nalf, complex *alf, integer * + nbet, complex *bet, integer *nmax, complex *ab, complex *aa, complex * + as, complex *bb, complex *bs, complex *c__, complex *cc, complex *cs, + complex *ct, real *g, complex *w, integer *iorder) +{ + /* Initialized data */ + + static char icht[2] = "NC"; + static char ichu[2] = "UL"; + + + /* System generated locals */ + integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; + complex q__1, q__2; + + /* Local variables */ + integer jjab; + complex beta; + integer ldas, ldbs, ldcs; + logical same, conj; + complex bets; + logical tran, null; + char uplo[1]; + integer i__, j, k, n; + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, + complex *); + complex alpha; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + real rbeta; + logical isame[13]; + integer nargs; + real rbets; + logical reset; + char trans[1]; + logical upper; + char uplos[1]; + extern /* Subroutine */ int cprcn5_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, complex *, integer *, + integer *, complex *, integer *), cprcn7_( + integer *, integer *, char *, integer *, char *, char *, integer * + , integer *, complex *, integer *, integer *, real *, integer *); + integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks, ns; + extern logical lceres_(char *, char *, integer *, integer *, complex *, + complex *, integer *); + real errmax; + char transs[1], transt[1]; + extern /* Subroutine */ int ccher2k_(integer *, char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + real *, complex *, integer *); + integer laa, lbb, lda, lcc, ldb, ldc; + extern logical lce_(complex *, complex *, integer *); + extern /* Subroutine */ int ccsyr2k_(integer *, char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + complex als; + integer ict, icu; + real err; + +/* Tests CHER2K and CSYR2K. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --w; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + --as; + --aa; + --ab; + + /* Function Body */ + conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 12; + nc = 0; + reset = TRUE_; + errmax = 0.f; + + i__1 = *nidim; + for (in = 1; in <= i__1; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = n; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L130; + } + lcc = ldc * n; + + i__2 = *nidim; + for (ik = 1; ik <= i__2; ++ik) { + k = idim[ik]; + + for (ict = 1; ict <= 2; ++ict) { + *(unsigned char *)trans = *(unsigned char *)&icht[ict - 1]; + tran = *(unsigned char *)trans == 'C'; + if (tran && ! conj) { + *(unsigned char *)trans = 'T'; + } + if (tran) { + ma = k; + na = n; + } else { + ma = n; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L110; + } + laa = lda * na; + +/* Generate the matrix A. */ + + if (tran) { + i__3 = *nmax << 1; + cmake_("ge", " ", " ", &ma, &na, &ab[1], &i__3, &aa[1], & + lda, &reset, &c_b1); + } else { + cmake_("ge", " ", " ", &ma, &na, &ab[1], nmax, &aa[1], & + lda, &reset, &c_b1); + } + +/* Generate the matrix B. */ + + ldb = lda; + lbb = laa; + if (tran) { + i__3 = *nmax << 1; + cmake_("ge", " ", " ", &ma, &na, &ab[k + 1], &i__3, &bb[1] + , &ldb, &reset, &c_b1); + } else { + cmake_("ge", " ", " ", &ma, &na, &ab[k * *nmax + 1], nmax, + &bb[1], &ldb, &reset, &c_b1); + } + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + upper = *(unsigned char *)uplo == 'U'; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + if (conj) { + rbeta = beta.r; + q__1.r = rbeta, q__1.i = 0.f; + beta.r = q__1.r, beta.i = q__1.i; + } + null = n <= 0; + if (conj) { + null = null || ((k <= 0 || (alpha.r == 0.f && + alpha.i == 0.f)) && rbeta == 1.f); + } + +/* Generate the matrix C. */ + + cmake_(sname + 7, uplo, " ", &n, &n, &c__[ + c_offset], nmax, &cc[1], &ldc, &reset, & + c_b1); + + ++nc; + +/* Save every datum before calling the subroutine. */ + + *(unsigned char *)uplos = *(unsigned char *)uplo; + *(unsigned char *)transs = *(unsigned char *) + trans; + ns = n; + ks = k; + als.r = alpha.r, als.i = alpha.i; + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + i__5 = lbb; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7] + .i; +/* L20: */ + } + ldbs = ldb; + if (conj) { + rbets = rbeta; + } else { + bets.r = beta.r, bets.i = beta.i; + } + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (conj) { + if (*trace) { + cprcn7_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, &ldb, + &rbeta, &ldc); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + ccher2k_(iorder, uplo, trans, &n, &k, &alpha, + &aa[1], &lda, &bb[1], &ldb, &rbeta, & + cc[1], &ldc); + } else { + if (*trace) { + cprcn5_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, &ldb, + &beta, &ldc); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + ccsyr2k_(iorder, uplo, trans, &n, &k, &alpha, + &aa[1], &lda, &bb[1], &ldb, &beta, & + cc[1], &ldc); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); + *fatal = TRUE_; + goto L150; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[1] = *(unsigned char *)transs == *(unsigned + char *)trans; + isame[2] = ns == n; + isame[3] = ks == k; + isame[4] = als.r == alpha.r && als.i == alpha.i; + isame[5] = lce_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + isame[7] = lce_(&bs[1], &bb[1], &lbb); + isame[8] = ldbs == ldb; + if (conj) { + isame[9] = rbets == rbeta; + } else { + isame[9] = bets.r == beta.r && bets.i == + beta.i; + } + if (null) { + isame[10] = lce_(&cs[1], &cc[1], &lcc); + } else { + isame[10] = lceres_("he", uplo, &n, &n, &cs[1] + , &cc[1], &ldc); + } + isame[11] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L150; + } + + if (! null) { + +/* Check the result column by column. */ + + if (conj) { + *(unsigned char *)transt = 'C'; + } else { + *(unsigned char *)transt = 'T'; + } + jjab = 1; + jc = 1; + i__5 = n; + for (j = 1; j <= i__5; ++j) { + if (upper) { + jj = 1; + lj = j; + } else { + jj = j; + lj = n - j + 1; + } + if (tran) { + i__6 = k; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = ((j - 1) << 1) * *nmax + k + + i__; + q__1.r = alpha.r * ab[i__8].r - + alpha.i * ab[i__8].i, + q__1.i = alpha.r * ab[ + i__8].i + alpha.i * ab[ + i__8].r; + w[i__7].r = q__1.r, w[i__7].i = + q__1.i; + if (conj) { + i__7 = k + i__; + r_cnjg(&q__2, &alpha); + i__8 = ((j - 1) << 1) * *nmax + i__; + q__1.r = q__2.r * ab[i__8].r - q__2.i * ab[i__8].i, + q__1.i = q__2.r * ab[i__8].i + q__2.i * ab[ + i__8].r; + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + } else { + i__7 = k + i__; + i__8 = ((j - 1) << 1) * *nmax + i__; + q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, q__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + } +/* L50: */ + } + i__6 = k << 1; + i__7 = *nmax << 1; + i__8 = *nmax << 1; + cmmch_(transt, "N", &lj, &c__1, &i__6, + &c_b2, &ab[jjab], &i__7, &w[ + 1], &i__8, &beta, &c__[jj + j + * c_dim1], nmax, &ct[1], &g[1] + , &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true); + } else { + i__6 = k; + for (i__ = 1; i__ <= i__6; ++i__) { + if (conj) { + i__7 = i__; + r_cnjg(&q__2, &ab[(k + i__ - 1) * *nmax + j]); + q__1.r = alpha.r * q__2.r - alpha.i * q__2.i, + q__1.i = alpha.r * q__2.i + alpha.i * + q__2.r; + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + i__7 = k + i__; + i__8 = (i__ - 1) * *nmax + j; + q__2.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, q__2.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + r_cnjg(&q__1, &q__2); + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + } else { + i__7 = i__; + i__8 = (k + i__ - 1) * *nmax + j; + q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, q__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + i__7 = k + i__; + i__8 = (i__ - 1) * *nmax + j; + q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, q__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + } +/* L60: */ + } + i__6 = k << 1; + i__7 = *nmax << 1; + cmmch_("N", "N", &lj, &c__1, &i__6, & + c_b2, &ab[jj], nmax, &w[1], & + i__7, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true); + } + if (upper) { + jc += ldc; + } else { + jc = jc + ldc + 1; + if (tran) { + jjab += *nmax << 1; + } + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L140; + } +/* L70: */ + } + } + +/* L80: */ + } + +/* L90: */ + } + +/* L100: */ + } + +L110: + ; + } + +/* L120: */ + } + +L130: + ; + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + if (*iorder == 1) { + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + } else { + if (*iorder == 0) { + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + if (*iorder == 1) { + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + } + goto L160; + +L140: + if (n > 1) { + printf(" THESE ARE THE RESULTS FOR COLUMN %d:\n",j); + } + +L150: + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + if (conj) { + cprcn7_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + ldb, &rbeta, &ldc); + } else { + cprcn5_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + ldb, &beta, &ldc); + } + +L160: + return 0; + +/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, */ +/* $ ', C,', I3, ') .' ) */ +/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */ +/* $ ',', F4.1, '), C,', I3, ') .' ) */ + +/* End of CCHK5. */ + +} /* cchk5_ */ + + +/* Subroutine */ int cprcn5_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, complex * + alpha, integer *lda, integer *ldb, complex *beta, integer *ldc) +{ + + /* Local variables */ + char ca[14], cu[14], crc[14]; + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("%d %d (%4.1f,%4.1f) , A, %d, B, %d, (%4.1f,%4.1f) , C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); + return 0; +} /* cprcn5_ */ + + + +/* Subroutine */ int cprcn7_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, complex * + alpha, integer *lda, integer *ldb, real *beta, integer *ldc) +{ + + /* Local variables */ + char ca[14], cu[14], crc[14]; + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("%d %d (%4.1f,%4.1f), A, %d, B, %d, %4.1f, C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,*beta,*ldc); + return 0; +} /* cprcn7_ */ + + +/* Subroutine */ int cmake_(char *type__, char *uplo, char *diag, integer *m, + integer *n, complex *a, integer *nmax, complex *aa, integer *lda, + logical *reset, complex *transl) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + real r__1; + complex q__1, q__2; + + /* Local variables */ + extern /* Complex */ VOID cbeg_(complex *, logical *); + integer ibeg, iend; + logical unit; + integer i__, j; + logical lower, upper; + integer jj; + logical gen, her, tri, sym; + + +/* Generates values for an M by N matrix A. */ +/* Stores the values in the array AA in the data structure required */ +/* by the routine, with unwanted elements set to rogue value. */ + +/* TYPE is 'ge', 'he', 'sy' or 'tr'. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --aa; + + /* Function Body */ + gen = s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0; + her = s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0; + sym = s_cmp(type__, "sy", (ftnlen)2, (ftnlen)2) == 0; + tri = s_cmp(type__, "tr", (ftnlen)2, (ftnlen)2) == 0; + upper = (her || sym || tri) && *(unsigned char *)uplo == 'U'; + lower = (her || sym || tri) && *(unsigned char *)uplo == 'L'; + unit = tri && *(unsigned char *)diag == 'U'; + +/* Generate data in array A. */ + + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + if (gen || (upper && i__ <= j) || (lower && i__ >= j)) { + i__3 = i__ + j * a_dim1; + cbeg_(&q__2, reset); + q__1.r = q__2.r + transl->r, q__1.i = q__2.i + transl->i; + a[i__3].r = q__1.r, a[i__3].i = q__1.i; + if (i__ != j) { +/* Set some elements to zero */ + if (*n > 3 && j == *n / 2) { + i__3 = i__ + j * a_dim1; + a[i__3].r = 0.f, a[i__3].i = 0.f; + } + if (her) { + i__3 = j + i__ * a_dim1; + r_cnjg(&q__1, &a[i__ + j * a_dim1]); + a[i__3].r = q__1.r, a[i__3].i = q__1.i; + } else if (sym) { + i__3 = j + i__ * a_dim1; + i__4 = i__ + j * a_dim1; + a[i__3].r = a[i__4].r, a[i__3].i = a[i__4].i; + } else if (tri) { + i__3 = j + i__ * a_dim1; + a[i__3].r = 0.f, a[i__3].i = 0.f; + } + } + } +/* L10: */ + } + if (her) { + i__2 = j + j * a_dim1; + i__3 = j + j * a_dim1; + r__1 = a[i__3].r; + q__1.r = r__1, q__1.i = 0.f; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + } + if (tri) { + i__2 = j + j * a_dim1; + i__3 = j + j * a_dim1; + q__1.r = a[i__3].r + 1.f, q__1.i = a[i__3].i + 0.f; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + } + if (unit) { + i__2 = j + j * a_dim1; + a[i__2].r = 1.f, a[i__2].i = 0.f; + } +/* L20: */ + } + +/* Store elements in array AS in data structure required by routine. */ + + if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + i__4 = i__ + j * a_dim1; + aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i; +/* L30: */ + } + i__2 = *lda; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10f, aa[i__3].i = 1e10f; +/* L40: */ + } +/* L50: */ + } + } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, + "sy", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, "tr", (ftnlen) + 2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (upper) { + ibeg = 1; + if (unit) { + iend = j - 1; + } else { + iend = j; + } + } else { + if (unit) { + ibeg = j + 1; + } else { + ibeg = j; + } + iend = *n; + } + i__2 = ibeg - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10f, aa[i__3].i = 1e10f; +/* L60: */ + } + i__2 = iend; + for (i__ = ibeg; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + i__4 = i__ + j * a_dim1; + aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i; +/* L70: */ + } + i__2 = *lda; + for (i__ = iend + 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10f, aa[i__3].i = 1e10f; +/* L80: */ + } + if (her) { + jj = j + (j - 1) * *lda; + i__2 = jj; + i__3 = jj; + r__1 = aa[i__3].r; + q__1.r = r__1, q__1.i = -1e10f; + aa[i__2].r = q__1.r, aa[i__2].i = q__1.i; + } +/* L90: */ + } + } + return 0; + +/* End of CMAKE. */ + +} /* cmake_ */ + +/* Subroutine */ int cmmch_(char *transa, char *transb, integer *m, integer * + n, integer *kk, complex *alpha, complex *a, integer *lda, complex *b, + integer *ldb, complex *beta, complex *c__, integer *ldc, complex *ct, + real *g, complex *cc, integer *ldcc, real *eps, real *err, logical * + fatal, integer *nout, logical *mv) +{ + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, + cc_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; + real r__1, r__2, r__3, r__4, r__5, r__6; + complex q__1, q__2, q__3, q__4; + + /* Local variables */ + real erri; + integer i__, j, k; + logical trana, tranb, ctrana, ctranb; + +/* Checks the results of the computational tests. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --ct; + --g; + cc_dim1 = *ldcc; + cc_offset = 1 + cc_dim1 * 1; + cc -= cc_offset; + + /* Function Body */ + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == + 'C'; + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == + 'C'; + ctrana = *(unsigned char *)transa == 'C'; + ctranb = *(unsigned char *)transb == 'C'; + +/* Compute expected result, one column at a time, in CT using data */ +/* in A, B and C. */ +/* Compute gauges in G. */ + + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + ct[i__3].r = 0.f, ct[i__3].i = 0.f; + g[i__] = 0.f; +/* L10: */ + } + if (! trana && ! tranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + i__7 = k + j * b_dim1; + q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, + q__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[ + i__7].r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = i__ + k * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag( + &a[i__ + k * a_dim1]), abs(r__2))) * ((r__3 = b[ + i__5].r, abs(r__3)) + (r__4 = r_imag(&b[k + j * + b_dim1]), abs(r__4))); +/* L20: */ + } +/* L30: */ + } + } else if (trana && ! tranb) { + if (ctrana) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + r_cnjg(&q__3, &a[k + i__ * a_dim1]); + i__6 = k + j * b_dim1; + q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, + q__2.i = q__3.r * b[i__6].i + q__3.i * b[i__6] + .r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) * (( + r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( + &b[k + j * b_dim1]), abs(r__4))); +/* L40: */ + } +/* L50: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + i__7 = k + j * b_dim1; + q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] + .i, q__2.i = a[i__6].r * b[i__7].i + a[i__6] + .i * b[i__7].r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) * (( + r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( + &b[k + j * b_dim1]), abs(r__4))); +/* L60: */ + } +/* L70: */ + } + } + } else if (! trana && tranb) { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + r_cnjg(&q__3, &b[j + k * b_dim1]); + q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, + q__2.i = a[i__6].r * q__3.i + a[i__6].i * + q__3.r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = i__ + k * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[i__ + k * a_dim1]), abs(r__2))) * (( + r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( + &b[j + k * b_dim1]), abs(r__4))); +/* L80: */ + } +/* L90: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + i__7 = j + k * b_dim1; + q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] + .i, q__2.i = a[i__6].r * b[i__7].i + a[i__6] + .i * b[i__7].r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = i__ + k * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[i__ + k * a_dim1]), abs(r__2))) * (( + r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( + &b[j + k * b_dim1]), abs(r__4))); +/* L100: */ + } +/* L110: */ + } + } + } else if (trana && tranb) { + if (ctrana) { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + r_cnjg(&q__3, &a[k + i__ * a_dim1]); + r_cnjg(&q__4, &b[j + k * b_dim1]); + q__2.r = q__3.r * q__4.r - q__3.i * q__4.i, + q__2.i = q__3.r * q__4.i + q__3.i * + q__4.r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + = r_imag(&b[j + k * b_dim1]), abs(r__4))); +/* L120: */ + } +/* L130: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + r_cnjg(&q__3, &a[k + i__ * a_dim1]); + i__6 = j + k * b_dim1; + q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, + q__2.i = q__3.r * b[i__6].i + q__3.i * b[ + i__6].r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + = r_imag(&b[j + k * b_dim1]), abs(r__4))); +/* L140: */ + } +/* L150: */ + } + } + } else { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + r_cnjg(&q__3, &b[j + k * b_dim1]); + q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, + q__2.i = a[i__6].r * q__3.i + a[i__6].i * + q__3.r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + = r_imag(&b[j + k * b_dim1]), abs(r__4))); +/* L160: */ + } +/* L170: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + i__7 = j + k * b_dim1; + q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[ + i__7].i, q__2.i = a[i__6].r * b[i__7].i + + a[i__6].i * b[i__7].r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + = r_imag(&b[j + k * b_dim1]), abs(r__4))); +/* L180: */ + } +/* L190: */ + } + } + } + } + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__; + q__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, q__2.i = + alpha->r * ct[i__4].i + alpha->i * ct[i__4].r; + i__5 = i__ + j * c_dim1; + q__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, q__3.i = + beta->r * c__[i__5].i + beta->i * c__[i__5].r; + q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i; + ct[i__3].r = q__1.r, ct[i__3].i = q__1.i; + i__3 = i__ + j * c_dim1; + g[i__] = ((r__1 = alpha->r, abs(r__1)) + (r__2 = r_imag(alpha), + abs(r__2))) * g[i__] + ((r__3 = beta->r, abs(r__3)) + ( + r__4 = r_imag(beta), abs(r__4))) * ((r__5 = c__[i__3].r, + abs(r__5)) + (r__6 = r_imag(&c__[i__ + j * c_dim1]), abs( + r__6))); +/* L200: */ + } + +/* Compute the error ratio for this result. */ + + *err = 0.f; + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__ + j * cc_dim1; + q__2.r = ct[i__3].r - cc[i__4].r, q__2.i = ct[i__3].i - cc[i__4] + .i; + q__1.r = q__2.r, q__1.i = q__2.i; + erri = ((r__1 = q__1.r, abs(r__1)) + (r__2 = r_imag(&q__1), abs( + r__2))) / *eps; + if (g[i__] != 0.f) { + erri /= g[i__]; + } + *err = f2cmax(*err,erri); + if (*err * sqrt(*eps) >= 1.f) { + goto L230; + } +/* L210: */ + } + +/* L220: */ + } + +/* If the loop completes, all results are at least half accurate. */ + goto L250; + +/* Report fatal error. */ + +L230: + *fatal = TRUE_; + printf(" ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HALF ACCURATE *******\n"); + printf(" EXPECTED RESULT COMPUTED RESULT\n"); + i__1 = *m; + for (i__ = 1; i__ <= i__1; ++i__) { + if (*mv) { + printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,ct[i__].r,ct[i__].i,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i); + } else { + printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i,ct[i__].r,ct[i__].i); + } +/* L240: */ + } + if (*n > 1) { + printf(" THESE ARE THE RESULTS FOR COLUMN %d\n",j); + } + +L250: + return 0; + + +/* End of CMMCH. */ + +} /* cmmch_ */ + +logical lce_(complex *ri, complex *rj, integer *lr) +{ + /* System generated locals */ + integer i__1, i__2, i__3; + logical ret_val; + + /* Local variables */ + integer i__; + + +/* Tests if two arrays are identical. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --rj; + --ri; + + /* Function Body */ + i__1 = *lr; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + i__3 = i__; + if (ri[i__2].r != rj[i__3].r || ri[i__2].i != rj[i__3].i) { + goto L20; + } +/* L10: */ + } + ret_val = TRUE_; + goto L30; +L20: + ret_val = FALSE_; +L30: + return ret_val; + +/* End of LCE. */ + +} /* lce_ */ + +logical lceres_(char *type__, char *uplo, integer *m, integer *n, complex *aa, + complex *as, integer *lda) +{ + /* System generated locals */ + integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4; + logical ret_val; + + /* Local variables */ + integer ibeg, iend, i__, j; + logical upper; + + +/* Tests if selected elements in two arrays are equal. */ + +/* TYPE is 'ge' or 'he' or 'sy'. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + as_dim1 = *lda; + as_offset = 1 + as_dim1 * 1; + as -= as_offset; + aa_dim1 = *lda; + aa_offset = 1 + aa_dim1 * 1; + aa -= aa_offset; + + /* Function Body */ + upper = *(unsigned char *)uplo == 'U'; + if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *lda; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L10: */ + } +/* L20: */ + } + } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, + "sy", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (upper) { + ibeg = 1; + iend = j; + } else { + ibeg = j; + iend = *n; + } + i__2 = ibeg - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L30: */ + } + i__2 = *lda; + for (i__ = iend + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L40: */ + } +/* L50: */ + } + } + +/* 60 CONTINUE */ + ret_val = TRUE_; + goto L80; +L70: + ret_val = FALSE_; +L80: + return ret_val; + +/* End of LCERES. */ + +} /* lceres_ */ + +/* Complex */ VOID cbeg_(complex * ret_val, logical *reset) +{ + /* System generated locals */ + real r__1, r__2; + complex q__1; + + /* Local variables */ + static integer i__, j, ic, mi, mj; + + +/* Generates complex numbers as pairs of random numbers uniformly */ +/* distributed between -0.5 and 0.5. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + if (*reset) { +/* Initialize local variables. */ + mi = 891; + mj = 457; + i__ = 7; + j = 7; + ic = 0; + *reset = FALSE_; + } + +/* The sequence of values of I or J is bounded between 1 and 999. */ +/* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. */ +/* If initial I or J = 4 or 8, the period will be 25. */ +/* If initial I or J = 5, the period will be 10. */ +/* IC is used to break up the period by skipping 1 value of I or J */ +/* in 6. */ + + ++ic; +L10: + i__ *= mi; + j *= mj; + i__ -= i__ / 1000 * 1000; + j -= j / 1000 * 1000; + if (ic >= 5) { + ic = 0; + goto L10; + } + r__1 = (i__ - 500) / 1001.f; + r__2 = (j - 500) / 1001.f; + q__1.r = r__1, q__1.i = r__2; + ret_val->r = q__1.r, ret_val->i = q__1.i; + return ; + +/* End of CBEG. */ + +} /* cbeg_ */ + +real sdiff_(real *x, real *y) +{ + /* System generated locals */ + real ret_val; + + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + ret_val = *x - *y; + return ret_val; + +/* End of SDIFF. */ + +} /* sdiff_ */ + +/* Main program alias */ /*int cblat3_ () { MAIN__ (); return 0; }*/ diff --git a/ctest/c_dblat1c.c b/ctest/c_dblat1c.c index bf2f7a781..089dca4da 100644 --- a/ctest/c_dblat1c.c +++ b/ctest/c_dblat1c.c @@ -21,19 +21,6 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; -#ifdef _MSC_VER -static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} -static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} -static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} -static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} -#else -static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} -static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} -static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} -static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} -#endif -#define pCf(z) (*_pCf(z)) -#define pCd(z) (*_pCd(z)) typedef int logical; typedef short int shortlogical; typedef char logical1; @@ -242,124 +229,6 @@ typedef struct Namelist Namelist; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -#if 0 -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif /* Common Block Declarations */ @@ -375,16 +244,16 @@ struct { static integer c__1 = 1; static doublereal c_b34 = 1.; -/* Main program */ int main() +/* Main program */ int main(void) { /* Initialized data */ static doublereal sfac = 9.765625e-4; /* Local variables */ - extern /* Subroutine */ int check0_(), check1_(), check2_(), check3_(); + extern /* Subroutine */ int check0_(doublereal*), check1_(doublereal*), check2_(doublereal*), check3_(doublereal*); static integer ic; - extern /* Subroutine */ int header_(); + extern /* Subroutine */ int header_(void); /* Test program for the DOUBLE PRECISION Level 1 CBLAS. */ /* Based upon the original CBLAS test routine together with: */ @@ -431,7 +300,7 @@ static doublereal c_b34 = 1.; } /* MAIN__ */ -/* Subroutine */ int header_() +/* Subroutine */ int header_(void) { /* Initialized data */ @@ -450,8 +319,7 @@ static doublereal c_b34 = 1.; } /* header_ */ -/* Subroutine */ int check0_(sfac) -doublereal *sfac; +/* Subroutine */ int check0_(doublereal* sfac) { /* Initialized data */ @@ -464,7 +332,7 @@ doublereal *sfac; /* Local variables */ static integer k; - extern /* Subroutine */ int drotgtest_(), stest1_(); + extern /* Subroutine */ int drotgtest_(doublereal*,doublereal*,doublereal*,doublereal*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*); static doublereal sa, sb, sc, ss; /* .. Parameters .. */ @@ -509,8 +377,7 @@ L40: return 0; } /* check0_ */ -/* Subroutine */ int check1_(sfac) -doublereal *sfac; +/* Subroutine */ int check1_(doublereal* sfac) { /* Initialized data */ @@ -535,14 +402,14 @@ doublereal *sfac; /* Local variables */ static integer i__; - extern doublereal dnrm2test_(); + extern doublereal dnrm2test_(integer*, doublereal*, integer*); static doublereal stemp[1], strue[8]; - extern /* Subroutine */ int stest_(), dscaltest_(); - extern doublereal dasumtest_(); - extern /* Subroutine */ int itest1_(), stest1_(); + extern /* Subroutine */ int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), dscaltest_(integer*,doublereal*,doublereal*,integer*); + extern doublereal dasumtest_(integer*,doublereal*,integer*); + extern /* Subroutine */ int itest1_(integer*,integer*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*); static doublereal sx[8]; static integer np1; - extern integer idamaxtest_(); + extern integer idamaxtest_(integer*,doublereal*,integer*); static integer len; /* .. Parameters .. */ @@ -603,8 +470,7 @@ doublereal *sfac; return 0; } /* check1_ */ -/* Subroutine */ int check2_(sfac) -doublereal *sfac; +/* Subroutine */ int check2_(doublereal* sfac) { /* Initialized data */ @@ -649,10 +515,10 @@ doublereal *sfac; /* Local variables */ static integer lenx, leny; - extern doublereal ddottest_(); + extern doublereal ddottest_(integer*,doublereal*,integer*,doublereal*,integer*); static integer i__, j, ksize; - extern /* Subroutine */ int stest_(), dcopytest_(), dswaptest_(), - daxpytest_(), stest1_(); + extern /* Subroutine */ int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), dcopytest_(integer*,doublereal*,integer*,doublereal*,integer*), dswaptest_(integer*,doublereal*,integer*,doublereal*,integer*), + daxpytest_(integer*,doublereal*,doublereal*,integer*,doublereal*,integer*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*); static integer ki, kn, mx, my; static doublereal sx[7], sy[7], stx[7], sty[7]; @@ -733,8 +599,7 @@ doublereal *sfac; return 0; } /* check2_ */ -/* Subroutine */ int check3_(sfac) -doublereal *sfac; +/* Subroutine */ int check3_(doublereal* sfac) { /* Initialized data */ @@ -753,9 +618,9 @@ doublereal *sfac; ; /* Local variables */ - extern /* Subroutine */ int drottest_(); + extern /* Subroutine */ int drottest_(integer*,doublereal*,integer*,doublereal*,integer*,doublereal*,doublereal*); static integer i__, k, ksize; - extern /* Subroutine */int stest_(), drotmtest_(); + extern /* Subroutine */int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), drotmtest_(integer*,doublereal*,integer*,doublereal*,integer*,doublereal*); static integer ki, kn; static doublereal dparam[5], sx[10], sy[10], stx[10], sty[10]; @@ -826,9 +691,7 @@ doublereal *sfac; return 0; } /* check3_ */ -/* Subroutine */ int stest_(len, scomp, strue, ssize, sfac) -integer *len; -doublereal *scomp, *strue, *ssize, *sfac; +/* Subroutine */ int stest_(integer* len, doublereal* scomp, doublereal* strue, doublereal* ssize, doublereal* sfac) { /* System generated locals */ integer i__1; @@ -836,7 +699,7 @@ doublereal *scomp, *strue, *ssize, *sfac; /* Local variables */ static integer i__; - extern doublereal sdiff_(); + extern doublereal sdiff_(doublereal*,doublereal*); static doublereal sd; /* ********************************* STEST ************************** */ @@ -892,11 +755,10 @@ L40: } /* stest_ */ -/* Subroutine */ int stest1_(scomp1, strue1, ssize, sfac) -doublereal *scomp1, *strue1, *ssize, *sfac; +/* Subroutine */ int stest1_(doublereal* scomp1, doublereal* strue1, doublereal* ssize, doublereal* sfac) { static doublereal scomp[1], strue[1]; - extern /* Subroutine */ int stest_(); + extern /* Subroutine */ int stest_(integer*, doublereal*, doublereal*, doublereal*, doublereal*); /* ************************* STEST1 ***************************** */ @@ -923,8 +785,7 @@ doublereal *scomp1, *strue1, *ssize, *sfac; return 0; } /* stest1_ */ -doublereal sdiff_(sa, sb) -doublereal *sa, *sb; +doublereal sdiff_(doublereal* sa, doublereal* sb) { /* System generated locals */ doublereal ret_val; @@ -938,8 +799,7 @@ doublereal *sa, *sb; return ret_val; } /* sdiff_ */ -/* Subroutine */ int itest1_(icomp, itrue) -integer *icomp, *itrue; +/* Subroutine */ int itest1_(integer* icomp, integer* itrue) { /* Local variables */ static integer id; @@ -1188,4 +1048,4 @@ doublereal *dparam; return 0; } /* drotm_ */ -#endif \ No newline at end of file +#endif diff --git a/ctest/c_dblat2c.c b/ctest/c_dblat2c.c index f94dbc1fe..547aa808e 100644 --- a/ctest/c_dblat2c.c +++ b/ctest/c_dblat2c.c @@ -242,129 +242,6 @@ typedef struct Namelist Namelist; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -#if 0 -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif -/* -- translated by f2c (version 20000121). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - /* Common Block Declarations */ @@ -395,7 +272,7 @@ static integer c_n1 = -1; static integer c__0 = 0; static logical c_false = FALSE_; -/* Main program */ int main() +/* Main program */ int main(void) { /* Initialized data */ @@ -413,17 +290,21 @@ static logical c_false = FALSE_; static logical same; static integer ninc, nbet, ntra; static logical rewi; - extern /* Subroutine */ int dchk1_(), dchk2_(), dchk3_(), dchk4_(), - dchk5_(), dchk6_(); + extern /* Subroutine */ int dchk1_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int dchk2_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int dchk3_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int dchk4_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int dchk5_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int dchk6_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen); static doublereal a[4225] /* was [65][65] */, g[65]; static integer i__, j; - extern doublereal ddiff_(); + extern doublereal ddiff_(doublereal*, doublereal*); static integer n; static logical fatal; static doublereal x[65], y[65], z__[130]; static logical trace; static integer nidim; - extern /* Subroutine */ int dmvch_(); + extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static char snaps[32], trans[1]; static integer isnum; static logical ltest[16]; @@ -437,11 +318,11 @@ static logical c_false = FALSE_; static char snamet[12]; static doublereal thresh; static logical rorder; - extern /* Subroutine */ int cd2chke_(); + extern /* Subroutine */ void cd2chke_(char*, ftnlen); static integer layout; static logical ltestt, tsterr; static doublereal alf[7]; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static integer inc[7], nkb; static doublereal bet[7],eps,err; char tmpchar; @@ -977,21 +858,7 @@ L240: } /* MAIN__ */ -/* Subroutine */ int dchk1_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, - incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nkb, *kb, *nalf; -doublereal *alf; -integer *nbet; -doublereal *bet; -integer *ninc, *inc, *nmax, *incmax; -doublereal *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* y, doublereal* yy, doublereal* ys, doublereal* yt, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1007,10 +874,10 @@ ftnlen sname_len; static integer incx, incy; static logical full, tran, null; static integer i__, m, n; - extern /* Subroutine */ int dmake_(); + extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static doublereal alpha; static logical isame[13]; - extern /* Subroutine */ int dmvch_(); + extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static integer incxs, incys; @@ -1018,13 +885,14 @@ ftnlen sname_len; static integer ia, ib, ic; static logical banded; static integer nc, nd, im, in, kl, ml, nk, nl, ku, ix, iy, ms, lx, ly, ns; - extern /* Subroutine */ int cdgbmv_(), cdgemv_(); - extern logical lderes_(); + extern /* Subroutine */ void cdgbmv_(integer*, char*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ void cdgemv_(integer*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static char ctrans[14]; static doublereal errmax, transl; static char transs[1]; static integer laa, lda; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static doublereal als, bls, err; static integer iku, kls, kus; @@ -1429,21 +1297,7 @@ L140: } /* dchk1_ */ -/* Subroutine */ int dchk2_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, - incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nkb, *kb, *nalf; -doublereal *alf; -integer *nbet; -doublereal *bet; -integer *ninc, *inc, *nmax, *incmax; -doublereal *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* y, doublereal* yy, doublereal* ys, doublereal* yt, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1460,10 +1314,10 @@ ftnlen sname_len; static logical full, null; static char uplo[1]; static integer i__, k, n; - extern /* Subroutine */ int dmake_(); + extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static doublereal alpha; static logical isame[13]; - extern /* Subroutine */ int dmvch_(); + extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static char cuplo[14]; @@ -1474,12 +1328,13 @@ ftnlen sname_len; static integer nc, ik, in; static logical packed; static integer nk, ks, ix, iy, ns, lx, ly; - extern logical lderes_(); - extern /* Subroutine */ int cdsbmv_(), cdspmv_(); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cdsbmv_(integer*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ void cdspmv_(integer*, char*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen); static doublereal errmax, transl; - extern /* Subroutine */ int cdsymv_(); + extern /* Subroutine */ void cdsymv_(integer*, char*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen); static integer laa, lda; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static doublereal als, bls, err; @@ -1882,17 +1737,7 @@ L130: } /* dchk2_ */ -/* Subroutine */ int dchk3_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nkb, kb, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, xt, g, z__, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nkb, *kb, *ninc, *inc, *nmax, *incmax; -doublereal *a, *aa, *as, *x, *xx, *xs, *xt, *g, *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* xt, doublereal* g, doublereal* z__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1911,10 +1756,10 @@ ftnlen sname_len; static logical full, null; static char uplo[1], cdiag[14]; static integer i__, k, n; - extern /* Subroutine */ int dmake_(); + extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static char diags[1]; static logical isame[13]; - extern /* Subroutine */ int dmvch_(); + extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static char cuplo[14]; @@ -1924,16 +1769,19 @@ ftnlen sname_len; static integer nc, ik, in; static logical packed; static integer nk, ks, ix, ns, lx; - extern logical lderes_(); - extern /* Subroutine */ int cdtbmv_(), cdtbsv_(); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cdtbmv_(integer*, char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cdtbsv_(integer*, char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); static char ctrans[14]; static doublereal errmax; - extern /* Subroutine */ int cdtpmv_(), cdtrmv_(); + extern /* Subroutine */ void cdtpmv_(integer*, char*, char*, char*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cdtrmv_(integer*, char*, char*, char*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); static doublereal transl; - extern /* Subroutine */ int cdtpsv_(), cdtrsv_(); + extern /* Subroutine */ void cdtpsv_(integer*, char*, char*, char*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cdtrsv_(integer*, char*, char*, char*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); static char transs[1]; static integer laa, icd, lda; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static integer ict, icu; static doublereal err; @@ -2388,19 +2236,7 @@ L130: } /* dchk3_ */ -/* Subroutine */ int dchk4_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublereal *alf; -integer *ninc, *inc, *nmax, *incmax; -doublereal *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g, *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* y, doublereal* yy, doublereal* ys, doublereal* yt, doublereal* g, doublereal* z__, integer* iorder, ftnlen sname_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6; @@ -2411,17 +2247,18 @@ ftnlen sname_len; static integer incx, incy; static logical null; static integer i__, j, m, n; - extern /* Subroutine */ int dmake_(), cdger_(); + extern /* Subroutine */ void cdger_(integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, integer*); + extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static doublereal alpha, w[1]; static logical isame[13]; - extern /* Subroutine */ int dmvch_(); + extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static integer incxs, incys, ia, nc, nd, im, in, ms, ix, iy, ns, lx, ly; - extern logical lderes_(); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static doublereal errmax, transl; static integer laa, lda; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static doublereal als, err; @@ -2727,19 +2564,7 @@ L150: } /* dchk4_ */ -/* Subroutine */ int dchk5_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublereal *alf; -integer *ninc, *inc, *nmax, *incmax; -doublereal *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g, *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* y, doublereal* yy, doublereal* ys, doublereal* yt, doublereal* g, doublereal* z__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -2757,25 +2582,25 @@ ftnlen sname_len; static logical full, null; static char uplo[1]; static integer i__, j, n; - extern /* Subroutine */ int dmake_(); + extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static doublereal alpha, w[1]; static logical isame[13]; - extern /* Subroutine */ int dmvch_(); + extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static integer nargs; - extern /* Subroutine */ int cdspr_(); + extern /* Subroutine */ void cdspr_(integer*, char*, integer*, doublereal*, doublereal*, integer*, doublereal*, ftnlen); static logical reset; static char cuplo[14]; static integer incxs; - extern /* Subroutine */ int cdsyr_(); + extern /* Subroutine */ void cdsyr_(integer*, char*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, ftnlen); static logical upper; static char uplos[1]; static integer ia, ja, ic, nc, jj, lj, in; static logical packed; static integer ix, ns, lx; - extern logical lderes_(); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static doublereal errmax, transl; static integer laa, lda; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static doublereal als, err; @@ -3096,19 +2921,7 @@ L130: } /* dchk5_ */ -/* Subroutine */ int dchk6_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublereal *alf; -integer *ninc, *inc, *nmax, *incmax; -doublereal *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g, *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk6_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* y, doublereal* yy, doublereal* ys, doublereal* yt, doublereal* g, doublereal* z__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -3125,24 +2938,25 @@ ftnlen sname_len; static logical full, null; static char uplo[1]; static integer i__, j, n; - extern /* Subroutine */ int dmake_(); + extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static doublereal alpha, w[2]; static logical isame[13]; - extern /* Subroutine */ int dmvch_(); + extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static char cuplo[14]; static integer incxs, incys; static logical upper; static char uplos[1]; - extern /* Subroutine */ int cdspr2_(), cdsyr2_(); + extern /* Subroutine */ void cdspr2_(integer*, char*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, ftnlen); + extern /* Subroutine */ void cdsyr2_(integer*, char*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen); static integer ia, ja, ic, nc, jj, lj, in; static logical packed; static integer ix, iy, ns, lx, ly; - extern logical lderes_(); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static doublereal errmax, transl; static integer laa, lda; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static doublereal als, err; /* Tests DSYR2 and DSPR2. */ @@ -3508,25 +3322,13 @@ L170: } /* dchk6_ */ -/* Subroutine */ int dmake_(type__, uplo, diag, m, n, a, nmax, aa, lda, kl, - ku, reset, transl, type_len, uplo_len, diag_len) -char *type__, *uplo, *diag; -integer *m, *n; -doublereal *a; -integer *nmax; -doublereal *aa; -integer *lda, *kl, *ku; -logical *reset; -doublereal *transl; -ftnlen type_len; -ftnlen uplo_len; -ftnlen diag_len; +/* Subroutine */ int dmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublereal* a, integer* nmax, doublereal* aa, integer* lda, integer* kl, integer* ku, logical* reset, doublereal* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; /* Local variables */ - extern doublereal dbeg_(); + extern doublereal dbeg_(logical* ); static integer ibeg, iend, ioff; static logical unit; static integer i__, j; @@ -3752,28 +3554,14 @@ ftnlen diag_len; } /* dmake_ */ -/* Subroutine */ int dmvch_(trans, m, n, alpha, a, nmax, x, incx, beta, y, - incy, yt, g, yy, eps, err, fatal, nout, mv, trans_len) -char *trans; -integer *m, *n; -doublereal *alpha, *a; -integer *nmax; -doublereal *x; -integer *incx; -doublereal *beta, *y; -integer *incy; -doublereal *yt, *g, *yy, *eps, *err; -logical *fatal; -integer *nout; -logical *mv; -ftnlen trans_len; +/* Subroutine */ int dmvch_(char* trans, integer* m, integer* n, doublereal* alpha, doublereal* a, integer* nmax, doublereal* x, integer* incx, doublereal* beta, doublereal* y, integer* incy, doublereal* yt, doublereal* g, doublereal* yy, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen trans_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2; doublereal d__1; /* Builtin functions */ - double sqrt(); + double sqrt(double); /* Local variables */ static doublereal erri; @@ -3902,9 +3690,7 @@ L70: } /* dmvch_ */ -logical lde_(ri, rj, lr) -doublereal *ri, *rj; -integer *lr; +logical lde_(doublereal* ri, doublereal* rj, integer* lr) { /* System generated locals */ integer i__1; @@ -3949,13 +3735,7 @@ L30: } /* lde_ */ -logical lderes_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len) -char *type__, *uplo; -integer *m, *n; -doublereal *aa, *as; -integer *lda; -ftnlen type_len; -ftnlen uplo_len; +logical lderes_(char* type__, char* uplo, integer* m, integer* n, doublereal* aa, doublereal* as, integer* lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2; @@ -4042,8 +3822,7 @@ L80: } /* lderes_ */ -doublereal dbeg_(reset) -logical *reset; +doublereal dbeg_(logical* reset) { /* System generated locals */ doublereal ret_val; @@ -4094,8 +3873,7 @@ L10: } /* dbeg_ */ -doublereal ddiff_(x, y) -doublereal *x, *y; +doublereal ddiff_(doublereal* x, doublereal* y) { /* System generated locals */ doublereal ret_val; diff --git a/ctest/c_dblat3c.c b/ctest/c_dblat3c.c index 05d6b65b0..dc3d6f9e7 100644 --- a/ctest/c_dblat3c.c +++ b/ctest/c_dblat3c.c @@ -242,129 +242,6 @@ typedef struct Namelist Namelist; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -#if 0 -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif -/* -- translated by f2c (version 20000121). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - /* Common Block Declarations */ @@ -393,7 +270,7 @@ static logical c_true = TRUE_; static integer c__0 = 0; static logical c_false = FALSE_; -/* Main program MAIN__() */ int main() +/* Main program MAIN__() */ int main(void) { /* Initialized data */ @@ -403,25 +280,24 @@ static logical c_false = FALSE_; integer i__1, i__2, i__3; doublereal d__1; - /* Builtin functions */ - integer s_rsle(), do_lio(), e_rsle(), f_open(), s_wsfe(), do_fio(), - e_wsfe(), s_wsle(), e_wsle(), s_rsfe(), e_rsfe(); - integer f_clos(); /* Local variables */ static integer nalf, idim[9]; static logical same; static integer nbet, ntra; static logical rewi; - extern /* Subroutine */ int dchk1_(), dchk2_(), dchk3_(), dchk4_(), - dchk5_(); + extern /* Subroutine */ int dchk1_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int dchk2_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int dchk3_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int dchk4_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen); +/* Subroutine */ int dchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* nmax, doublereal* ab, doublereal* aa, doublereal* as, doublereal* bb, doublereal* bs, doublereal* c__, doublereal* cc, doublereal* cs, doublereal* ct, doublereal* g, doublereal* w, integer* iorder, ftnlen sname_len); static doublereal c__[4225] /* was [65][65] */, g[65]; static integer i__, j; - extern doublereal ddiff_(); + extern doublereal ddiff_(doublereal*, doublereal*); static integer n; static logical fatal; static doublereal w[130]; - extern /* Subroutine */ int dmmch_(); + extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static logical trace; static integer nidim; static char snaps[32]; @@ -433,11 +309,11 @@ static logical c_false = FALSE_; static char snamet[12], transa[1], transb[1]; static doublereal thresh; static logical rorder; - extern /* Subroutine */ int cd3chke_(); + extern /* Subroutine */ void cd3chke_(char*, ftnlen); static integer layout; static logical ltestt, tsterr; static doublereal alf[7]; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static doublereal bet[7], eps, err; char tmpchar; @@ -907,21 +783,7 @@ L230: } /* MAIN__ */ -/* Subroutine */ int dchk1_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, - c__, cc, cs, ct, g, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublereal *alf; -integer *nbet; -doublereal *bet; -integer *nmax; -doublereal *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* nmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* b, doublereal* bb, doublereal* bs, doublereal* c__, doublereal* cc, doublereal* cs, doublereal* ct, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -931,29 +793,27 @@ ftnlen sname_len; integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6; - /* Builtin functions */ - integer f_rew(), s_wsfe(), e_wsfe(), do_fio(); /* Local variables */ static doublereal beta; static integer ldas, ldbs, ldcs; static logical same, null; static integer i__, k, m, n; - extern /* Subroutine */ int dmake_(); + extern /* Subroutine */ int dmake_(char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static doublereal alpha; - extern /* Subroutine */ int dmmch_(); + extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static logical isame[13], trana, tranb; static integer nargs; static logical reset; - extern /* Subroutine */ void dprcn1_(); + extern /* Subroutine */ void dprcn1_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, integer*, doublereal*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); static integer ia, ib, ma, mb, na, nb, nc, ik, im, in; - extern /* Subroutine */ int cdgemm_(); + extern /* Subroutine */ void cdgemm_(integer*, char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static integer ks, ms, ns; - extern logical lderes_(); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static char tranas[1], tranbs[1], transa[1], transb[1]; static doublereal errmax; static integer ica, icb, laa, lbb, lda, lcc, ldb, ldc; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static doublereal als, bls, err; /* Tests DGEMM. */ @@ -1283,23 +1143,8 @@ L130: } /* dchk1_ */ -/* Subroutine */ void dprcn1_(nout, nc, sname, iorder, transa, transb, m, n, k, - alpha, lda, ldb, beta, ldc, sname_len, transa_len, transb_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *transa, *transb; -integer *m, *n, *k; -doublereal *alpha; -integer *lda, *ldb; -doublereal *beta; -integer *ldc; -ftnlen sname_len; -ftnlen transa_len; -ftnlen transb_len; +/* Subroutine */ void dprcn1_(integer* nout, integer* nc, char* sname, integer* iorder, char* transa, char* transb, integer* m, integer* n, integer* k, doublereal* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen transa_len, ftnlen transb_len) { - /* Builtin functions */ - integer s_wsfe(), do_fio(), e_wsfe(); /* Local variables */ static char crc[14], cta[14], ctb[14]; @@ -1328,21 +1173,7 @@ ftnlen transb_len; } /* dprcn1_ */ -/* Subroutine */ int dchk2_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, - c__, cc, cs, ct, g, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublereal *alf; -integer *nbet; -doublereal *bet; -integer *nmax; -doublereal *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* nmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* b, doublereal* bb, doublereal* bs, doublereal* c__, doublereal* cc, doublereal* cs, doublereal* ct, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1353,8 +1184,6 @@ ftnlen sname_len; integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; - /* Builtin functions */ - integer f_rew(), s_wsfe(), e_wsfe(), do_fio(); /* Local variables */ static doublereal beta; @@ -1364,21 +1193,21 @@ ftnlen sname_len; static logical left, null; static char uplo[1]; static integer i__, m, n; - extern /* Subroutine */ int dmake_(); + extern /* Subroutine */ int dmake_(char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static doublereal alpha; - extern /* Subroutine */ int dmmch_(); + extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static logical isame[13]; static char sides[1]; static integer nargs; static logical reset; static char uplos[1]; - extern /* Subroutine */ void dprcn2_(); + extern /* Subroutine */ void dprcn2_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); static integer ia, ib, na, nc, im, in, ms, ns; - extern logical lderes_(); - extern /* Subroutine */ int cdsymm_(); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cdsymm_(integer*, char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static doublereal errmax; static integer laa, lbb, lda, lcc, ldb, ldc; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static integer ics; static doublereal als, bls; static integer icu; @@ -1692,23 +1521,8 @@ L120: } /* dchk2_ */ -/* Subroutine */ void dprcn2_(nout, nc, sname, iorder, side, uplo, m, n, alpha, - lda, ldb, beta, ldc, sname_len, side_len, uplo_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *side, *uplo; -integer *m, *n; -doublereal *alpha; -integer *lda, *ldb; -doublereal *beta; -integer *ldc; -ftnlen sname_len; -ftnlen side_len; -ftnlen uplo_len; +/* Subroutine */ void dprcn2_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, integer* m, integer* n, doublereal* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len) { - /* Builtin functions */ - integer s_wsfe(), do_fio(), e_wsfe(); /* Local variables */ static char cs[14], cu[14], crc[14]; @@ -1733,19 +1547,7 @@ ftnlen uplo_len; } /* dprcn2_ */ -/* Subroutine */ int dchk3_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nmax, a, aa, as, b, bb, bs, ct, g, c__, - iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublereal *alf; -integer *nmax; -doublereal *a, *aa, *as, *b, *bb, *bs, *ct, *g, *c__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* b, doublereal* bb, doublereal* bs, doublereal* ct, doublereal* g, doublereal* c__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1766,24 +1568,24 @@ ftnlen sname_len; static logical left, null; static char uplo[1]; static integer i__, j, m, n; - extern /* Subroutine */ int dmake_(); + extern /* Subroutine */ int dmake_(char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static doublereal alpha; static char diags[1]; - extern /* Subroutine */ int dmmch_(); + extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static logical isame[13]; static char sides[1]; static integer nargs; static logical reset; static char uplos[1]; - extern /* Subroutine */ void dprcn3_(); + extern /* Subroutine */ void dprcn3_(integer*, integer*, char*, integer*, char*, char*, char*, char*, integer*, integer*, doublereal*, integer*, integer*, ftnlen, ftnlen, ftnlen, ftnlen, ftnlen); static integer ia, na, nc, im, in, ms, ns; - extern logical lderes_(); - extern /* Subroutine */ int cdtrmm_(); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cdtrmm_(integer*, char*, char*, char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen, ftnlen); static char tranas[1], transa[1]; - extern /* Subroutine */ int cdtrsm_(); + extern /* Subroutine */ void cdtrsm_(integer*, char*, char*, char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen, ftnlen); static doublereal errmax; static integer laa, icd, lbb, lda, ldb; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static integer ics; static doublereal als; static integer ict, icu; @@ -2165,24 +1967,8 @@ L160: } /* dchk3_ */ -/* Subroutine */ void dprcn3_(nout, nc, sname, iorder, side, uplo, transa, - diag, m, n, alpha, lda, ldb, sname_len, side_len, uplo_len, - transa_len, diag_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *side, *uplo, *transa, *diag; -integer *m, *n; -doublereal *alpha; -integer *lda, *ldb; -ftnlen sname_len; -ftnlen side_len; -ftnlen uplo_len; -ftnlen transa_len; -ftnlen diag_len; +/* Subroutine */ void dprcn3_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, char* transa, char* diag, integer* m, integer* n, doublereal* alpha, integer* lda, integer* ldb, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len, ftnlen transa_len, ftnlen diag_len) { - /* Builtin functions */ - integer s_wsfe(), do_fio(), e_wsfe(); /* Local variables */ static char ca[14], cd[14], cs[14], cu[14], crc[14]; @@ -2219,21 +2005,7 @@ ftnlen diag_len; } /* dprcn3_ */ -/* Subroutine */ int dchk4_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, - c__, cc, cs, ct, g, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublereal *alf; -integer *nbet; -doublereal *bet; -integer *nmax; -doublereal *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* nmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* b, doublereal* bb, doublereal* bs, doublereal* c__, doublereal* cc, doublereal* cs, doublereal* ct, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -2244,8 +2016,6 @@ ftnlen sname_len; integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; - /* Builtin functions */ - integer f_rew(), s_wsfe(), e_wsfe(), do_fio(); /* Local variables */ static doublereal beta; @@ -2255,23 +2025,23 @@ ftnlen sname_len; static logical tran, null; static char uplo[1]; static integer i__, j, k, n; - extern /* Subroutine */ int dmake_(); + extern /* Subroutine */ int dmake_(char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static doublereal alpha; - extern /* Subroutine */ int dmmch_(); + extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static logical isame[13]; static integer nargs; static logical reset; static char trans[1]; static logical upper; static char uplos[1]; - extern /* Subroutine */ void dprcn4_(); + extern /* Subroutine */ void dprcn4_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); static integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks, ns; - extern logical lderes_(); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static doublereal errmax; - extern /* Subroutine */ int cdsyrk_(); + extern /* Subroutine */ void cdsyrk_(integer*, char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static char transs[1]; static integer laa, lda, lcc, ldc; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static doublereal als; static integer ict, icu; static doublereal err; @@ -2586,23 +2356,8 @@ L130: } /* dchk4_ */ -/* Subroutine */ void dprcn4_(nout, nc, sname, iorder, uplo, transa, n, k, - alpha, lda, beta, ldc, sname_len, uplo_len, transa_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *uplo, *transa; -integer *n, *k; -doublereal *alpha; -integer *lda; -doublereal *beta; -integer *ldc; -ftnlen sname_len; -ftnlen uplo_len; -ftnlen transa_len; +/* Subroutine */ void dprcn4_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublereal* alpha, integer* lda, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { - /* Builtin functions */ - integer s_wsfe(), do_fio(), e_wsfe(); /* Local variables */ static char ca[14], cu[14], crc[14]; @@ -2629,21 +2384,7 @@ ftnlen transa_len; } /* dprcn4_ */ -/* Subroutine */ int dchk5_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, ab, aa, as, bb, bs, - c__, cc, cs, ct, g, w, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublereal *alf; -integer *nbet; -doublereal *bet; -integer *nmax; -doublereal *ab, *aa, *as, *bb, *bs, *c__, *cc, *cs, *ct, *g, *w; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* nmax, doublereal* ab, doublereal* aa, doublereal* as, doublereal* bb, doublereal* bs, doublereal* c__, doublereal* cc, doublereal* cs, doublereal* ct, doublereal* g, doublereal* w, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -2653,8 +2394,6 @@ ftnlen sname_len; /* System generated locals */ integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; - /* Builtin functions */ - integer f_rew(), s_wsfe(), e_wsfe(), do_fio(); /* Local variables */ static integer jjab; @@ -2665,23 +2404,23 @@ ftnlen sname_len; static logical tran, null; static char uplo[1]; static integer i__, j, k, n; - extern /* Subroutine */ int dmake_(); + extern /* Subroutine */ int dmake_(char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static doublereal alpha; - extern /* Subroutine */ int dmmch_(); + extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static logical isame[13]; static integer nargs; static logical reset; static char trans[1]; static logical upper; static char uplos[1]; - extern /* Subroutine */ void dprcn5_(); + extern /* Subroutine */ void dprcn5_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); static integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks, ns; - extern logical lderes_(); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static doublereal errmax; static char transs[1]; static integer laa, lbb, lda, lcc, ldb, ldc; - extern logical lde_(); - extern /* Subroutine */ int cdsyr2k_(); + extern logical lde_(doublereal*, doublereal*, integer*); + extern /* Subroutine */ void cdsyr2k_(integer*, char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static doublereal als; static integer ict, icu; static doublereal err; @@ -3048,23 +2787,8 @@ L160: } /* dchk5_ */ -/* Subroutine */ void dprcn5_(nout, nc, sname, iorder, uplo, transa, n, k, - alpha, lda, ldb, beta, ldc, sname_len, uplo_len, transa_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *uplo, *transa; -integer *n, *k; -doublereal *alpha; -integer *lda, *ldb; -doublereal *beta; -integer *ldc; -ftnlen sname_len; -ftnlen uplo_len; -ftnlen transa_len; +/* Subroutine */ void dprcn5_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublereal* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { - /* Builtin functions */ - integer s_wsfe(), do_fio(), e_wsfe(); /* Local variables */ static char ca[14], cu[14], crc[14]; @@ -3091,25 +2815,13 @@ ftnlen transa_len; } /* dprcn5_ */ -/* Subroutine */ int dmake_(type__, uplo, diag, m, n, a, nmax, aa, lda, reset, - transl, type_len, uplo_len, diag_len) -char *type__, *uplo, *diag; -integer *m, *n; -doublereal *a; -integer *nmax; -doublereal *aa; -integer *lda; -logical *reset; -doublereal *transl; -ftnlen type_len; -ftnlen uplo_len; -ftnlen diag_len; +/* Subroutine */ int dmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublereal* a, integer* nmax, doublereal* aa, integer* lda, logical* reset, doublereal* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2; /* Local variables */ - extern doublereal dbeg_(); + extern doublereal dbeg_(logical*); static integer ibeg, iend; static logical unit; static integer i__, j; @@ -3241,25 +2953,7 @@ ftnlen diag_len; } /* dmake_ */ -/* Subroutine */ int dmmch_(transa, transb, m, n, kk, alpha, a, lda, b, ldb, - beta, c__, ldc, ct, g, cc, ldcc, eps, err, fatal, nout, mv, - transa_len, transb_len) -char *transa, *transb; -integer *m, *n, *kk; -doublereal *alpha, *a; -integer *lda; -doublereal *b; -integer *ldb; -doublereal *beta, *c__; -integer *ldc; -doublereal *ct, *g, *cc; -integer *ldcc; -doublereal *eps, *err; -logical *fatal; -integer *nout; -logical *mv; -ftnlen transa_len; -ftnlen transb_len; +/* Subroutine */ int dmmch_(char* transa, char* transb, integer* m, integer* n, integer* kk, doublereal* alpha, doublereal* a, integer* lda, doublereal* b, integer* ldb, doublereal* beta, doublereal* c__, integer* ldc, doublereal* ct, doublereal* g, doublereal* cc, integer* ldcc, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen transa_len, ftnlen transb_len) { /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, @@ -3267,8 +2961,7 @@ ftnlen transb_len; doublereal d__1, d__2; /* Builtin functions */ - double sqrt(); - integer s_wsfe(), e_wsfe(), do_fio(); + double sqrt(double); /* Local variables */ static doublereal erri; @@ -3432,9 +3125,7 @@ L150: } /* dmmch_ */ -logical lde_(ri, rj, lr) -doublereal *ri, *rj; -integer *lr; +logical lde_(doublereal* ri, doublereal* rj, integer* lr) { /* System generated locals */ integer i__1; @@ -3481,13 +3172,7 @@ L30: } /* lde_ */ -logical lderes_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len) -char *type__, *uplo; -integer *m, *n; -doublereal *aa, *as; -integer *lda; -ftnlen type_len; -ftnlen uplo_len; +logical lderes_(char* type__, char* uplo, integer* m, integer* n, doublereal* aa, doublereal* as, integer* lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2; @@ -3576,8 +3261,7 @@ L80: } /* lderes_ */ -doublereal dbeg_(reset) -logical *reset; +doublereal dbeg_(logical* reset) { /* System generated locals */ doublereal ret_val; @@ -3629,8 +3313,7 @@ L10: } /* dbeg_ */ -doublereal ddiff_(x, y) -doublereal *x, *y; +doublereal ddiff_(doublereal* x, doublereal* y) { /* System generated locals */ doublereal ret_val; diff --git a/ctest/c_sblat1c.c b/ctest/c_sblat1c.c index 57e4707a9..7a81e04c1 100644 --- a/ctest/c_sblat1c.c +++ b/ctest/c_sblat1c.c @@ -21,19 +21,6 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; -#ifdef _MSC_VER -static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} -static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} -static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} -static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} -#else -static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} -static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} -static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} -static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} -#endif -#define pCf(z) (*_pCf(z)) -#define pCd(z) (*_pCd(z)) typedef int logical; typedef short int shortlogical; typedef char logical1; @@ -242,250 +229,6 @@ typedef struct Namelist Namelist; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -#if 0 -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif -#if 0 -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif -#if 0 -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif -/* -- translated by f2c (version 20000121). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - /* Common Block Declarations */ @@ -393,7 +270,7 @@ static logical c_true = TRUE_; static integer c__0 = 0; static logical c_false = FALSE_; -/* Main program MAIN__() */ int main() +/* Main program MAIN__() */ int main(void) { /* Initialized data */ @@ -402,26 +279,25 @@ static logical c_false = FALSE_; /* System generated locals */ integer i__1, i__2, i__3; real r__1; - /* Builtin functions */ - integer s_rsle(), do_lio(), e_rsle(), f_open(), s_wsfe(), do_fio(), - e_wsfe(), s_wsle(), e_wsle(), s_rsfe(), e_rsfe(); - integer f_clos(); /* Local variables */ static integer nalf, idim[9]; static logical same; static integer nbet, ntra; static logical rewi; - extern /* Subroutine */ int schk1_(), schk2_(), schk3_(), schk4_(), - schk5_(); + extern /* Subroutine */ int schk1_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, real*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen); + extern /* Subroutine */ int schk2_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, real*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen); + extern /* Subroutine */ int schk3_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen); + extern /* Subroutine */ int schk4_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, real*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen); + extern /* Subroutine */ int schk5_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, real*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen); static real c__[4225] /* was [65][65] */, g[65]; static integer i__, j, n; static logical fatal; static real w[130]; - extern doublereal sdiff_(); + extern doublereal sdiff_(real*, real*); static logical trace; static integer nidim; - extern /* Subroutine */ int smmch_(); + extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen); static char snaps[32]; static integer isnum; static logical ltest[6]; @@ -433,9 +309,9 @@ static logical c_false = FALSE_; static logical rorder; static integer layout; static logical ltestt, tsterr; - extern /* Subroutine */ int cs3chke_(); + extern /* Subroutine */ void cs3chke_(char*, ftnlen); static real alf[7], bet[7]; - extern logical lse_(); + extern logical lse_(real*, real*, integer*); static real eps, err; char tmpchar; @@ -899,21 +775,7 @@ L230: } /* MAIN__ */ -/* Subroutine */ int schk1_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, - c__, cc, cs, ct, g, iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -real *alf; -integer *nbet; -real *bet; -integer *nmax; -real *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int schk1_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* nbet, real* bet, integer* nmax, real* a, real* aa, real* as, real* b, real* bb, real* bs, real* c__, real* cc, real* cs, real* ct, real* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -923,8 +785,6 @@ ftnlen sname_len; integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6; - /* Builtin functions */ - integer f_rew(), s_wsfe(), e_wsfe(), do_fio(); /* Local variables */ static real beta; @@ -936,18 +796,17 @@ ftnlen sname_len; static logical trana, tranb; static integer nargs; static logical reset; - extern /* Subroutine */ void sprcn1_(); - extern /* Subroutine */ int smake_(); - extern /* Subroutine */ int smmch_(); + extern /* Subroutine */ void sprcn1_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, integer*, real*, integer*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen); static integer ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns; - extern /* Subroutine */ int csgemm_(); + extern /* Subroutine */ void csgemm_(integer*, char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, ftnlen, ftnlen); static char tranas[1], tranbs[1], transa[1], transb[1]; static real errmax; - extern logical lseres_(); - extern logical lse_(); + extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen); + extern logical lse_(real*, real*, integer*); static integer ica, icb, laa, lbb, lda, lcc, ldb, ldc; static real als, bls; - extern logical lse_(); static real err; /* Tests SGEMM. */ @@ -1278,23 +1137,8 @@ L130: -/* Subroutine */ void sprcn1_(nout, nc, sname, iorder, transa, transb, m, n, k, - alpha, lda, ldb, beta, ldc, sname_len, transa_len, transb_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *transa, *transb; -integer *m, *n, *k; -real *alpha; -integer *lda, *ldb; -real *beta; -integer *ldc; -ftnlen sname_len; -ftnlen transa_len; -ftnlen transb_len; +/* Subroutine */ void sprcn1_(integer* nout, integer* nc, char* sname, integer* iorder, char* transa, char* transb, integer* m, integer* n, integer* k, real* alpha, integer* lda, integer* ldb, real* beta, integer* ldc, ftnlen sname_len, ftnlen transa_len, ftnlen transb_len) { - /* Builtin functions */ - integer s_wsfe(), do_fio(), e_wsfe(); /* Local variables */ static char crc[14], cta[14], ctb[14]; @@ -1324,21 +1168,7 @@ ftnlen transb_len; } /* sprcn1_ */ -/* Subroutine */ int schk2_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, - c__, cc, cs, ct, g, iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -real *alf; -integer *nbet; -real *bet; -integer *nmax; -real *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int schk2_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* nbet, real* bet, integer* nmax, real* a, real* aa, real* as, real* b, real* bb, real* bs, real* c__, real* cc, real* cs, real* ct, real* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1349,8 +1179,6 @@ ftnlen sname_len; integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; - /* Builtin functions */ - integer f_rew(), s_wsfe(), e_wsfe(), do_fio(); /* Local variables */ static real beta; @@ -1368,15 +1196,15 @@ ftnlen sname_len; static char uplos[1]; static integer ia, ib, na, nc, im, in, ms, ns; static real errmax; - extern logical lseres_(); - extern /* Subroutine */ int cssymm_(); - extern void sprcn2_(); - extern int smake_(); - extern int smmch_(); + extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cssymm_(integer*, char*, char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, ftnlen, ftnlen); + extern void sprcn2_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, real*, integer*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen); static integer laa, lbb, lda, lcc, ldb, ldc, ics; static real als, bls; static integer icu; - extern logical lse_(); + extern logical lse_(real*, real*, integer*); static real err; /* Tests SSYMM. */ @@ -1685,23 +1513,8 @@ L120: } /* schk2_ */ -/* Subroutine */ void sprcn2_(nout, nc, sname, iorder, side, uplo, m, n, alpha, - lda, ldb, beta, ldc, sname_len, side_len, uplo_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *side, *uplo; -integer *m, *n; -real *alpha; -integer *lda, *ldb; -real *beta; -integer *ldc; -ftnlen sname_len; -ftnlen side_len; -ftnlen uplo_len; +/* Subroutine */ void sprcn2_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, integer* m, integer* n, real* alpha, integer* lda, integer* ldb, real* beta, integer* ldc, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len) { - /* Builtin functions */ - integer s_wsfe(), do_fio(), e_wsfe(); /* Local variables */ static char cs[14], cu[14], crc[14]; @@ -1726,19 +1539,7 @@ ftnlen uplo_len; } /* sprcn2_ */ -/* Subroutine */ int schk3_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nmax, a, aa, as, b, bb, bs, ct, g, c__, - iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -real *alf; -integer *nmax; -real *a, *aa, *as, *b, *bb, *bs, *ct, *g, *c__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int schk3_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* nmax, real* a, real* aa, real* as, real* b, real* bb, real* bs, real* ct, real* g, real* c__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1751,8 +1552,6 @@ ftnlen sname_len; integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; - /* Builtin functions */ - integer f_rew(), s_wsfe(), e_wsfe(), do_fio(); /* Local variables */ static char diag[1]; @@ -1769,18 +1568,19 @@ ftnlen sname_len; static integer nargs; static logical reset; static char uplos[1]; - extern /* Subroutine */ void sprcn3_(); + extern /* Subroutine */ void sprcn3_(integer*, integer*, char*, integer*, char*, char*, char*, char*, integer*, integer*, real*, integer*, integer*, ftnlen , ftnlen, ftnlen, ftnlen, ftnlen); static integer ia, na, nc, im, in, ms, ns; static char tranas[1], transa[1]; static real errmax; - extern int smake_(); - extern int smmch_(); - extern logical lseres_(); - extern /* Subroutine */ int cstrmm_(), cstrsm_(); + extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen); + extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cstrmm_(integer*, char*, char*, char*, char*, integer*, integer*, real*, real*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cstrsm_(integer*, char*, char*, char*, char*, integer*, integer*, real*, real*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen, ftnlen); static integer laa, icd, lbb, lda, ldb, ics; static real als; static integer ict, icu; - extern logical lse_(); + extern logical lse_(real*, real*, integer*); static real err; /* Tests STRMM and STRSM. */ @@ -2155,24 +1955,8 @@ L160: } /* schk3_ */ -/* Subroutine */ void sprcn3_(nout, nc, sname, iorder, side, uplo, transa, - diag, m, n, alpha, lda, ldb, sname_len, side_len, uplo_len, - transa_len, diag_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *side, *uplo, *transa, *diag; -integer *m, *n; -real *alpha; -integer *lda, *ldb; -ftnlen sname_len; -ftnlen side_len; -ftnlen uplo_len; -ftnlen transa_len; -ftnlen diag_len; +/* Subroutine */ void sprcn3_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, char* transa, char* diag, integer* m, integer* n, real* alpha, integer* lda, integer* ldb, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len, ftnlen transa_len, ftnlen diag_len) { - /* Builtin functions */ - integer s_wsfe(), do_fio(), e_wsfe(); /* Local variables */ static char ca[14], cd[14], cs[14], cu[14], crc[14]; @@ -2210,21 +1994,7 @@ ftnlen diag_len; } /* sprcn3_ */ -/* Subroutine */ int schk4_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, - c__, cc, cs, ct, g, iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -real *alf; -integer *nbet; -real *bet; -integer *nmax; -real *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int schk4_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* nbet, real* bet, integer* nmax, real* a, real* aa, real* as, real* b, real* bb, real* bs, real* c__, real* cc, real* cs, real* ct, real* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -2235,8 +2005,6 @@ ftnlen sname_len; integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; - /* Builtin functions */ - integer f_rew(), s_wsfe(), e_wsfe(), do_fio(); /* Local variables */ static real beta; @@ -2253,18 +2021,18 @@ ftnlen sname_len; static char trans[1]; static logical upper; static char uplos[1]; - extern /* Subroutine */ void sprcn4_(); - extern /* Subroutine */ int smake_(); - extern /* Subroutine */ int smmch_(); + extern /* Subroutine */ void sprcn4_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen); static integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks, ns; static real errmax; - extern logical lseres_(); + extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen); static char transs[1]; - extern /* Subroutine */ int cssyrk_(); + extern /* Subroutine */ void cssyrk_(integer*, char*, char*, integer*, integer*, real*, real*, integer*, real*, real*, integer*, ftnlen, ftnlen); static integer laa, lda, lcc, ldc; static real als; static integer ict, icu; - extern logical lse_(); + extern logical lse_(real*, real*, integer*); static real err; /* Tests SSYRK. */ @@ -2575,23 +2343,8 @@ L130: } /* schk4_ */ -/* Subroutine */ void sprcn4_(nout, nc, sname, iorder, uplo, transa, n, k, - alpha, lda, beta, ldc, sname_len, uplo_len, transa_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *uplo, *transa; -integer *n, *k; -real *alpha; -integer *lda; -real *beta; -integer *ldc; -ftnlen sname_len; -ftnlen uplo_len; -ftnlen transa_len; +/* Subroutine */ void sprcn4_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, real* alpha, integer* lda, real* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { - /* Builtin functions */ - integer s_wsfe(), do_fio(), e_wsfe(); /* Local variables */ static char ca[14], cu[14], crc[14]; @@ -2619,21 +2372,7 @@ ftnlen transa_len; } /* sprcn4_ */ -/* Subroutine */ int schk5_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, ab, aa, as, bb, bs, - c__, cc, cs, ct, g, w, iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -real *alf; -integer *nbet; -real *bet; -integer *nmax; -real *ab, *aa, *as, *bb, *bs, *c__, *cc, *cs, *ct, *g, *w; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int schk5_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* nbet, real* bet, integer* nmax, real* ab, real* aa, real* as, real* bb, real* bs, real* c__, real* cc, real* cs, real* ct, real* g, real* w, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -2643,8 +2382,6 @@ ftnlen sname_len; /* System generated locals */ integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; - /* Builtin functions */ - integer f_rew(), s_wsfe(), e_wsfe(), do_fio(); /* Local variables */ static integer jjab; @@ -2663,18 +2400,18 @@ ftnlen sname_len; static logical upper; static char uplos[1]; static integer ia, ib; - extern /* Subroutine */ void sprcn5_(); + extern /* Subroutine */ void sprcn5_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, real*, integer*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen); static integer jc, ma, na, nc, ik, in, jj, lj, ks, ns; static real errmax; - extern logical lseres_(); - extern int smake_(); + extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen); static char transs[1]; static integer laa, lbb, lda, lcc, ldb, ldc; static real als; static integer ict, icu; - extern /* Subroutine */ int cssyr2k_(); - extern logical lse_(); - extern int smmch_(); + extern /* Subroutine */ void cssyr2k_(integer*, char*, char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, ftnlen, ftnlen); + extern logical lse_(real*, real*, integer*); + extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen); static real err; /* Tests SSYR2K. */ @@ -3037,23 +2774,8 @@ L160: } /* schk5_ */ -/* Subroutine */ void sprcn5_(nout, nc, sname, iorder, uplo, transa, n, k, - alpha, lda, ldb, beta, ldc, sname_len, uplo_len, transa_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *uplo, *transa; -integer *n, *k; -real *alpha; -integer *lda, *ldb; -real *beta; -integer *ldc; -ftnlen sname_len; -ftnlen uplo_len; -ftnlen transa_len; +/* Subroutine */ void sprcn5_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, real* alpha, integer* lda, integer* ldb, real* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { - /* Builtin functions */ - integer s_wsfe(), do_fio(), e_wsfe(); /* Local variables */ static char ca[14], cu[14], crc[14]; @@ -3081,19 +2803,7 @@ ftnlen transa_len; } /* sprcn5_ */ -/* Subroutine */ int smake_(type__, uplo, diag, m, n, a, nmax, aa, lda, reset, - transl, type_len, uplo_len, diag_len) -char *type__, *uplo, *diag; -integer *m, *n; -real *a; -integer *nmax; -real *aa; -integer *lda; -logical *reset; -real *transl; -ftnlen type_len; -ftnlen uplo_len; -ftnlen diag_len; +/* Subroutine */ int smake_(char* type__, char* uplo, char* diag, integer* m, integer* n, real* a, integer* nmax, real* aa, integer* lda, logical* reset, real* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2; @@ -3102,7 +2812,7 @@ ftnlen diag_len; /* Local variables */ static integer ibeg, iend; - extern doublereal sbeg_(); + extern doublereal sbeg_(logical*); static logical unit; static integer i__, j; static logical lower, upper, gen, tri, sym; @@ -3233,25 +2943,7 @@ ftnlen diag_len; } /* smake_ */ -/* Subroutine */ int smmch_(transa, transb, m, n, kk, alpha, a, lda, b, ldb, - beta, c__, ldc, ct, g, cc, ldcc, eps, err, fatal, nout, mv, - transa_len, transb_len) -char *transa, *transb; -integer *m, *n, *kk; -real *alpha, *a; -integer *lda; -real *b; -integer *ldb; -real *beta, *c__; -integer *ldc; -real *ct, *g, *cc; -integer *ldcc; -real *eps, *err; -logical *fatal; -integer *nout; -logical *mv; -ftnlen transa_len; -ftnlen transb_len; +/* Subroutine */ int smmch_(char* transa, char* transb, integer* m, integer* n, integer* kk, real* alpha, real* a, integer* lda, real* b, integer* ldb, real* beta, real* c__, integer* ldc, real* ct, real* g, real* cc, integer* ldcc, real* eps, real* err, logical* fatal, integer* nout, logical* mv, ftnlen transa_len, ftnlen transb_len) { /* System generated locals */ @@ -3260,8 +2952,7 @@ ftnlen transb_len; real r__1, r__2; /* Builtin functions */ - double sqrt(); - integer s_wsfe(), e_wsfe(), do_fio(); + double sqrt(double); /* Local variables */ static real erri; @@ -3426,9 +3117,7 @@ L150: } /* smmch_ */ -logical lse_(ri, rj, lr) -real *ri, *rj; -integer *lr; +logical lse_(real* ri, real* rj, integer* lr) { /* System generated locals */ integer i__1; @@ -3475,13 +3164,7 @@ L30: } /* lse_ */ -logical lseres_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len) -char *type__, *uplo; -integer *m, *n; -real *aa, *as; -integer *lda; -ftnlen type_len; -ftnlen uplo_len; +logical lseres_(char* type__, char* uplo, integer* m, integer* n, real* aa, real* as, integer* lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2; @@ -3572,8 +3255,7 @@ L80: } /* lseres_ */ -doublereal sbeg_(reset) -logical *reset; +doublereal sbeg_(logical* reset) { /* System generated locals */ real ret_val; @@ -3625,8 +3307,7 @@ L10: } /* sbeg_ */ -doublereal sdiff_(x, y) -real *x, *y; +doublereal sdiff_(real* x, real* y) { /* System generated locals */ real ret_val; diff --git a/ctest/c_zblat1c.c b/ctest/c_zblat1c.c index d5b080633..f7c0515fc 100644 --- a/ctest/c_zblat1c.c +++ b/ctest/c_zblat1c.c @@ -242,250 +242,6 @@ typedef struct Namelist Namelist; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -#if 0 -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif -#if 0 -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif -/* -- translated by f2c (version 20000121). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - /* Common Block Declarations */ @@ -396,7 +273,7 @@ static integer c_n1 = -1; static integer c__0 = 0; static logical c_false = FALSE_; -/* Main program */ int main() +/* Main program */ int main(void) { /* Initialized data */ @@ -414,19 +291,23 @@ static logical c_false = FALSE_; static logical same; static integer ninc, nbet, ntra; static logical rewi; - extern /* Subroutine */ int zchk1_(), zchk2_(), zchk3_(), zchk4_(), - zchk5_(), zchk6_(); + extern /* Subroutine */ int zchk1_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int zchk2_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int zchk3_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen); + extern /* Subroutine */ int zchk4_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen); + extern /* Subroutine */ int zchk5_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen); + extern /* Subroutine */ int zchk6_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen); static doublecomplex a[4225] /* was [65][65] */; static doublereal g[65]; static integer i__, j; - extern doublereal ddiff_(); + extern doublereal ddiff_(doublereal*, doublereal*); static integer n; static logical fatal; static doublecomplex x[65], y[65], z__[130]; static logical trace; static integer nidim; static char snaps[32], trans[1]; - extern /* Subroutine */ int zmvch_(); + extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static integer isnum; static logical ltest[17]; static doublecomplex aa[4225]; @@ -441,12 +322,12 @@ static logical c_false = FALSE_; static logical rorder; static integer layout; static logical ltestt, tsterr; - extern /* Subroutine */ int cz2chke_(); + extern /* Subroutine */ void cz2chke_(char*, ftnlen); static doublecomplex alf[7]; static integer inc[7], nkb; static doublecomplex bet[7]; static doublereal eps, err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); char tmpchar; /* Test program for the DOUBLE PRECISION COMPLEX Level 2 Blas. */ @@ -984,22 +865,7 @@ L240: } /* MAIN__ */ -/* Subroutine */ int zchk1_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, - incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nkb, *kb, *nalf; -doublecomplex *alf; -integer *nbet; -doublecomplex *bet; -integer *ninc, *inc, *nmax, *incmax; -doublecomplex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt; -doublereal *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* y, doublecomplex* yy, doublecomplex* ys, doublecomplex* yt, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1018,27 +884,27 @@ ftnlen sname_len; static integer i__, m, n; static doublecomplex alpha; static logical isame[13]; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; static logical reset; static integer incxs, incys; static char trans[1]; - extern /* Subroutine */ int zmvch_(); + extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static integer ia, ib, ic; static logical banded; static integer nc, nd, im, in, kl, ml, nk, nl, ku, ix, iy, ms, lx, ly, ns; - extern /* Subroutine */ int czgbmv_(); + extern /* Subroutine */ void czgbmv_(integer*, char*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen); static char ctrans[14]; - extern /* Subroutine */ int czgemv_(); + extern /* Subroutine */ void czgemv_(integer*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen); static doublereal errmax; static doublecomplex transl; - extern logical lzeres_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static char transs[1]; static integer laa, lda; static doublecomplex als, bls; static doublereal err; static integer iku, kls; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); static integer kus; @@ -1451,22 +1317,7 @@ L140: } /* zchk1_ */ -/* Subroutine */ int zchk2_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, - incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nkb, *kb, *nalf; -doublecomplex *alf; -integer *nbet; -doublecomplex *bet; -integer *ninc, *inc, *nmax, *incmax; -doublecomplex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt; -doublereal *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* y, doublecomplex* yy, doublecomplex* ys, doublecomplex* yt, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1486,27 +1337,28 @@ ftnlen sname_len; static integer i__, k, n; static doublecomplex alpha; static logical isame[13]; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; static logical reset; static char cuplo[14]; static integer incxs, incys; - extern /* Subroutine */ int zmvch_(); + extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static char uplos[1]; static integer ia, ib, ic; static logical banded; static integer nc, ik, in; static logical packed; static integer nk, ks, ix, iy, ns, lx, ly; - extern /* Subroutine */ int czhbmv_(), czhemv_(); + extern /* Subroutine */ void czhbmv_(integer*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen); + extern /* Subroutine */ void czhemv_(integer*, char*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen); static doublereal errmax; static doublecomplex transl; - extern logical lzeres_(); - extern /* Subroutine */ int czhpmv_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void czhpmv_(integer*, char*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen); static integer laa, lda; static doublecomplex als, bls; static doublereal err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests CHEMV, CHBMV and CHPMV. */ @@ -1909,19 +1761,7 @@ L130: } /* zchk2_ */ -/* Subroutine */ int zchk3_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nkb, kb, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, xt, g, z__, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nkb, *kb, *ninc, *inc, *nmax, *incmax; -doublecomplex *a, *aa, *as, *x, *xx, *xs, *xt; -doublereal *g; -doublecomplex *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* xt, doublereal* g, doublecomplex* z__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1942,13 +1782,13 @@ ftnlen sname_len; static integer i__, k, n; static char diags[1]; static logical isame[13]; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; static logical reset; static char cuplo[14]; static integer incxs; static char trans[1]; - extern /* Subroutine */ int zmvch_(); + extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static char uplos[1]; static logical banded; static integer nc, ik, in; @@ -1957,14 +1797,17 @@ ftnlen sname_len; static char ctrans[14]; static doublereal errmax; static doublecomplex transl; - extern logical lzeres_(); - extern /* Subroutine */ int cztbmv_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cztbmv_(integer*, char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); static char transs[1]; - extern /* Subroutine */ int cztbsv_(), cztpmv_(), cztrmv_(), cztpsv_(), - cztrsv_(); + extern /* Subroutine */ void cztbsv_(integer*, char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cztpmv_(integer*, char*, char*, char*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cztpsv_(integer*, char*, char*, char*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cztrmv_(integer*, char*, char*, char*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cztrsv_(integer*, char*, char*, char*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); static integer laa, icd, lda, ict, icu; static doublereal err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); @@ -2422,21 +2265,7 @@ L130: } /* zchk3_ */ -/* Subroutine */ int zchk4_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublecomplex *alf; -integer *ninc, *inc, *nmax, *incmax; -doublecomplex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt; -doublereal *g; -doublecomplex *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* y, doublecomplex* yy, doublecomplex* ys, doublecomplex* yt, doublereal* g, doublecomplex* z__, integer* iorder, ftnlen sname_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; @@ -2450,21 +2279,21 @@ ftnlen sname_len; static integer i__, j, m, n; static doublecomplex alpha, w[1]; static logical isame[13]; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; static logical reset; static integer incxs, incys; - extern /* Subroutine */ int zmvch_(); + extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static integer ia, nc, nd, im, in, ms, ix, iy, ns, lx, ly; - extern /* Subroutine */ int czgerc_(); + extern /* Subroutine */ void czgerc_(integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, integer*); static doublereal errmax; - extern /* Subroutine */ int czgeru_(); + extern /* Subroutine */ void czgeru_(integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, integer*); static doublecomplex transl; - extern logical lzeres_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static integer laa, lda; static doublecomplex als; static doublereal err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); @@ -2793,21 +2622,7 @@ L150: } /* zchk4_ */ -/* Subroutine */ int zchk5_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublecomplex *alf; -integer *ninc, *inc, *nmax, *incmax; -doublecomplex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt; -doublereal *g; -doublecomplex *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* y, doublecomplex* yy, doublecomplex* ys, doublecomplex* yt, doublereal* g, doublecomplex* z__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -2827,13 +2642,14 @@ ftnlen sname_len; static integer i__, j, n; static doublecomplex alpha, w[1]; static logical isame[13]; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; - extern /* Subroutine */ int czher_(); + extern /* Subroutine */ void czher_(integer*, char*, integer*, doublereal*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen); static logical reset; static char cuplo[14]; static integer incxs; - extern /* Subroutine */ int czhpr_(), zmvch_(); + extern /* Subroutine */ void czhpr_(integer*, char*, integer*, doublereal*, doublecomplex*, integer*, doublecomplex*, ftnlen); + extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static logical upper; static char uplos[1]; static integer ia, ja, ic, nc, jj, lj, in; @@ -2841,10 +2657,10 @@ ftnlen sname_len; static integer ix, ns, lx; static doublereal ralpha, errmax; static doublecomplex transl; - extern logical lzeres_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static integer laa, lda; static doublereal err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZHER and ZHPR. */ @@ -3167,21 +2983,7 @@ L130: } /* zchk5_ */ -/* Subroutine */ int zchk6_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublecomplex *alf; -integer *ninc, *inc, *nmax, *incmax; -doublecomplex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt; -doublereal *g; -doublecomplex *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk6_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* y, doublecomplex* yy, doublecomplex* ys, doublecomplex* yt, doublereal* g, doublecomplex* z__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -3201,25 +3003,26 @@ ftnlen sname_len; static integer i__, j, n; static doublecomplex alpha, w[2]; static logical isame[13]; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; static logical reset; static char cuplo[14]; static integer incxs, incys; - extern /* Subroutine */ int zmvch_(); + extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static logical upper; static char uplos[1]; - extern /* Subroutine */ int czher2_(), czhpr2_(); + extern /* Subroutine */ void czher2_(integer*, char*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen); + extern /* Subroutine */ void czhpr2_(integer*, char*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, ftnlen); static integer ia, ja, ic, nc, jj, lj, in; static logical packed; static integer ix, iy, ns, lx, ly; static doublereal errmax; static doublecomplex transl; - extern logical lzeres_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static integer laa, lda; static doublecomplex als; static doublereal err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZHER2 and ZHPR2. */ @@ -3604,24 +3407,7 @@ L170: } /* zchk6_ */ -/* Subroutine */ int zmvch_(trans, m, n, alpha, a, nmax, x, incx, beta, y, - incy, yt, g, yy, eps, err, fatal, nout, mv, trans_len) -char *trans; -integer *m, *n; -doublecomplex *alpha, *a; -integer *nmax; -doublecomplex *x; -integer *incx; -doublecomplex *beta, *y; -integer *incy; -doublecomplex *yt; -doublereal *g; -doublecomplex *yy; -doublereal *eps, *err; -logical *fatal; -integer *nout; -logical *mv; -ftnlen trans_len; +/* Subroutine */ int zmvch_(char* trans, integer* m, integer* n, doublecomplex* alpha, doublecomplex* a, integer* nmax, doublecomplex* x, integer* incx, doublecomplex* beta, doublecomplex* y, integer* incy, doublecomplex* yt, doublereal* g, doublecomplex* yy, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen trans_len) { /* System generated locals */ @@ -3819,9 +3605,7 @@ L80: } /* zmvch_ */ -logical lze_(ri, rj, lr) -doublecomplex *ri, *rj; -integer *lr; +logical lze_(doublecomplex* ri, doublecomplex* rj, integer* lr) { /* System generated locals */ integer i__1, i__2, i__3; @@ -3868,13 +3652,7 @@ L30: } /* lze_ */ -logical lzeres_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len) -char *type__, *uplo; -integer *m, *n; -doublecomplex *aa, *as; -integer *lda; -ftnlen type_len; -ftnlen uplo_len; +logical lzeres_(char* type__, char* uplo, integer* m, integer* n, doublecomplex* aa, doublecomplex* as, integer* lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4; @@ -3967,9 +3745,7 @@ L80: } /* lzeres_ */ -/* Double Complex */ VOID zbeg_( ret_val, reset) -doublecomplex * ret_val; -logical *reset; +/* Double Complex */ VOID zbeg_( doublecomplex* ret_val, logical* reset) { /* System generated locals */ doublereal d__1, d__2; @@ -4030,8 +3806,7 @@ L10: } /* zbeg_ */ -doublereal ddiff_(x, y) -doublereal *x, *y; +doublereal ddiff_(doublereal* x, doublereal* y) { /* System generated locals */ doublereal ret_val; @@ -4051,19 +3826,7 @@ doublereal *x, *y; } /* ddiff_ */ -/* Subroutine */ int zmake_(type__, uplo, diag, m, n, a, nmax, aa, lda, kl, - ku, reset, transl, type_len, uplo_len, diag_len) -char *type__, *uplo, *diag; -integer *m, *n; -doublecomplex *a; -integer *nmax; -doublecomplex *aa; -integer *lda, *kl, *ku; -logical *reset; -doublecomplex *transl; -ftnlen type_len; -ftnlen uplo_len; -ftnlen diag_len; +/* Subroutine */ int zmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublecomplex* a, integer* nmax, doublecomplex* aa, integer* lda, integer* kl, integer* ku, logical* reset, doublecomplex* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; @@ -4072,7 +3835,7 @@ ftnlen diag_len; /* Local variables */ static integer ibeg, iend, ioff; - extern /* Double Complex */ VOID zbeg_(); + extern /* Double Complex */ VOID zbeg_(doublecomplex*, logical*); static logical unit; static integer i__, j; static logical lower; diff --git a/ctest/c_zblat3c.c b/ctest/c_zblat3c.c index eca2c3ff6..6025c0052 100644 --- a/ctest/c_zblat3c.c +++ b/ctest/c_zblat3c.c @@ -22,14 +22,11 @@ typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; #ifdef _MSC_VER -static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} -static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} #else static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} -static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} #endif #define pCf(z) (*_pCf(z)) @@ -242,124 +239,7 @@ typedef struct Namelist Namelist; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -#if 0 -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif + /* Common Block Declarations */ @@ -388,7 +268,7 @@ static logical c_true = TRUE_; static integer c__0 = 0; static logical c_false = FALSE_; -/* Main program MAIN__() */ int main() +/* Main program MAIN__() */ int main(void) { /* Initialized data */ @@ -400,26 +280,29 @@ static logical c_false = FALSE_; doublereal d__1; /* Builtin functions */ - integer s_rsle(), do_lio(), e_rsle(), f_open(), s_wsfe(), do_fio(), - e_wsfe(), s_wsle(), e_wsle(), s_rsfe(), e_rsfe(); + integer s_rsle(void), do_lio(void), e_rsle(void), f_open(void), s_wsfe(void), do_fio(void), + e_wsfe(void), s_wsle(void), e_wsle(void), s_rsfe(void), e_rsfe(void); /* Local variables */ static integer nalf, idim[9]; static logical same; static integer nbet, ntra; static logical rewi; - extern /* Subroutine */ int zchk1_(), zchk2_(), zchk3_(), zchk4_(), - zchk5_(); + extern /* Subroutine */ int zchk1_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int zchk2_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int zchk3_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen); + extern /* Subroutine */ int zchk4_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int zchk5_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen); static doublecomplex c__[4225] /* was [65][65] */; static doublereal g[65]; static integer i__, j; - extern doublereal ddiff_(); + extern doublereal ddiff_(doublereal*, doublereal*); static integer n; static logical fatal; static doublecomplex w[130]; static logical trace; static integer nidim; - extern /* Subroutine */ int zmmch_(); + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static char snaps[32]; static integer isnum; static logical ltest[9]; @@ -431,10 +314,10 @@ static logical c_false = FALSE_; static logical rorder; static integer layout; static logical ltestt, tsterr; - extern /* Subroutine */ int cz3chke_(); + extern /* Subroutine */ int cz3chke_(char*, ftnlen); static doublecomplex alf[7], bet[7]; static doublereal eps, err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); char tmpchar; /* Test program for the COMPLEX*16 Level 3 Blas. */ @@ -924,22 +807,7 @@ L230: } /* MAIN__ */ -/* Subroutine */ int zchk1_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, - c__, cc, cs, ct, g, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublecomplex *alf; -integer *nbet; -doublecomplex *bet; -integer *nmax; -doublecomplex *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct; -doublereal *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -956,21 +824,21 @@ ftnlen sname_len; static integer i__, k, m, n; static doublecomplex alpha; static logical isame[13], trana, tranb; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; - extern /* Subroutine */ int zmmch_(); + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static logical reset; static integer ia, ib; - extern /* Subroutine */ int zprcn1_(); + extern /* Subroutine */ int zprcn1_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); static integer ma, mb, na, nb, nc, ik, im, in, ks, ms, ns; - extern /* Subroutine */ int czgemm_(); + extern /* Subroutine */ void czgemm_(integer*, char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static char tranas[1], tranbs[1], transa[1], transb[1]; static doublereal errmax; - extern logical lzeres_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static integer ica, icb, laa, lbb, lda, lcc, ldb, ldc; static doublecomplex als, bls; static doublereal err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZGEMM. */ @@ -1313,20 +1181,7 @@ L130: } /* zchk1_ */ -/* Subroutine */ int zprcn1_(nout, nc, sname, iorder, transa, transb, m, n, k, - alpha, lda, ldb, beta, ldc, sname_len, transa_len, transb_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *transa, *transb; -integer *m, *n, *k; -doublecomplex *alpha; -integer *lda, *ldb; -doublecomplex *beta; -integer *ldc; -ftnlen sname_len; -ftnlen transa_len; -ftnlen transb_len; +/* Subroutine */ int zprcn1_(integer* nout, integer* nc, char* sname, integer* iorder, char* transa, char* transb, integer* m, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen transa_len, ftnlen transb_len) { /* Local variables */ static char crc[14], cta[14], ctb[14]; @@ -1357,22 +1212,7 @@ return 0; } /* zprcn1_ */ -/* Subroutine */ int zchk2_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, - c__, cc, cs, ct, g, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublecomplex *alf; -integer *nbet; -doublecomplex *bet; -integer *nmax; -doublecomplex *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct; -doublereal *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1394,23 +1234,23 @@ ftnlen sname_len; static doublecomplex alpha; static logical isame[13]; static char sides[1]; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; - extern /* Subroutine */ int zmmch_(); + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static logical reset; static char uplos[1]; static integer ia, ib; - extern /* Subroutine */ int zprcn2_(); + extern /* Subroutine */ int zprcn2_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); static integer na, nc, im, in, ms, ns; - extern /* Subroutine */ int czhemm_(); + extern /* Subroutine */ void czhemm_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static doublereal errmax; - extern logical lzeres_(); - extern /* Subroutine */ int czsymm_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void czsymm_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static integer laa, lbb, lda, lcc, ldb, ldc, ics; static doublecomplex als, bls; static integer icu; static doublereal err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZHEMM and ZSYMM. */ @@ -1737,20 +1577,7 @@ L120: } /* zchk2_ */ -/* Subroutine */ int zprcn2_(nout, nc, sname, iorder, side, uplo, m, n, alpha, - lda, ldb, beta, ldc, sname_len, side_len, uplo_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *side, *uplo; -integer *m, *n; -doublecomplex *alpha; -integer *lda, *ldb; -doublecomplex *beta; -integer *ldc; -ftnlen sname_len; -ftnlen side_len; -ftnlen uplo_len; +/* Subroutine */ int zprcn2_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, integer* m, integer* n, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len) { /* Local variables */ static char cs[14], cu[14], crc[14]; @@ -1777,21 +1604,7 @@ return 0; } /* zprcn2_ */ -/* Subroutine */ int zchk3_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nmax, a, aa, as, b, bb, bs, ct, g, c__, - iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublecomplex *alf; -integer *nmax; -doublecomplex *a, *aa, *as, *b, *bb, *bs, *ct; -doublereal *g; -doublecomplex *c__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* ct, doublereal* g, doublecomplex* c__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1817,23 +1630,24 @@ ftnlen sname_len; static char diags[1]; static logical isame[13]; static char sides[1]; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; - extern /* Subroutine */ int zmmch_(); + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static logical reset; static char uplos[1]; static integer ia, na; - extern /* Subroutine */ int zprcn3_(); + extern /* Subroutine */ int zprcn3_(integer*, integer*, char*, integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, ftnlen, ftnlen, ftnlen, ftnlen, ftnlen); static integer nc, im, in, ms, ns; static char tranas[1], transa[1]; static doublereal errmax; - extern logical lzeres_(); - extern /* Subroutine */ int cztrmm_(), cztrsm_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cztrmm_(integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cztrsm_(integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen, ftnlen); static integer laa, icd, lbb, lda, ldb, ics; static doublecomplex als; static integer ict, icu; static doublereal err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZTRMM and ZTRSM. */ @@ -2227,21 +2041,7 @@ L160: } /* zchk3_ */ -/* Subroutine */ int zprcn3_(nout, nc, sname, iorder, side, uplo, transa, - diag, m, n, alpha, lda, ldb, sname_len, side_len, uplo_len, - transa_len, diag_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *side, *uplo, *transa, *diag; -integer *m, *n; -doublecomplex *alpha; -integer *lda, *ldb; -ftnlen sname_len; -ftnlen side_len; -ftnlen uplo_len; -ftnlen transa_len; -ftnlen diag_len; +/* Subroutine */ int zprcn3_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, char* transa, char* diag, integer* m, integer* n, doublecomplex* alpha, integer* lda, integer* ldb, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len, ftnlen transa_len, ftnlen diag_len) { /* Local variables */ @@ -2281,22 +2081,7 @@ return 0; } /* zprcn3_ */ -/* Subroutine */ int zchk4_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, - c__, cc, cs, ct, g, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublecomplex *alf; -integer *nbet; -doublecomplex *bet; -integer *nmax; -doublecomplex *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct; -doublereal *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -2320,30 +2105,30 @@ ftnlen sname_len; static doublecomplex alpha; static doublereal rbeta; static logical isame[13]; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; - extern /* Subroutine */ int zmmch_(); + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static doublereal rbets; static logical reset; static char trans[1]; static logical upper; static char uplos[1]; static integer ia, ib, jc, ma, na; - extern /* Subroutine */ int zprcn4_(); + extern /* Subroutine */ int zprcn4_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); static integer nc; - extern /* Subroutine */ int zprcn6_(); + extern /* Subroutine */ int zprcn6_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); static integer ik, in, jj, lj, ks, ns; static doublereal ralpha; - extern /* Subroutine */ int czherk_(); + extern /* Subroutine */ int czherk_(integer*, char*, char*, integer*, integer*, doublereal*, doublecomplex*, integer*, doublereal*, doublecomplex*, integer*, ftnlen, ftnlen); static doublereal errmax; - extern logical lzeres_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static char transs[1], transt[1]; - extern /* Subroutine */ int czsyrk_(); + extern /* Subroutine */ int czsyrk_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static integer laa, lda, lcc, ldc; static doublecomplex als; static integer ict, icu; static doublereal err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZHERK and ZSYRK. */ @@ -2732,20 +2517,7 @@ L130: } /* zchk4_ */ -/* Subroutine */ int zprcn4_(nout, nc, sname, iorder, uplo, transa, n, k, - alpha, lda, beta, ldc, sname_len, uplo_len, transa_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *uplo, *transa; -integer *n, *k; -doublecomplex *alpha; -integer *lda; -doublecomplex *beta; -integer *ldc; -ftnlen sname_len; -ftnlen uplo_len; -ftnlen transa_len; +/* Subroutine */ int zprcn4_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { /* Local variables */ static char ca[14], cu[14], crc[14]; @@ -2775,20 +2547,7 @@ return 0; -/* Subroutine */ int zprcn6_(nout, nc, sname, iorder, uplo, transa, n, k, - alpha, lda, beta, ldc, sname_len, uplo_len, transa_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *uplo, *transa; -integer *n, *k; -doublereal *alpha; -integer *lda; -doublereal *beta; -integer *ldc; -ftnlen sname_len; -ftnlen uplo_len; -ftnlen transa_len; +/* Subroutine */ int zprcn6_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublereal* alpha, integer* lda, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { /* Local variables */ @@ -2818,23 +2577,7 @@ return 0; } /* zprcn6_ */ -/* Subroutine */ int zchk5_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, ab, aa, as, bb, bs, - c__, cc, cs, ct, g, w, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublecomplex *alf; -integer *nbet; -doublecomplex *bet; -integer *nmax; -doublecomplex *ab, *aa, *as, *bb, *bs, *c__, *cc, *cs, *ct; -doublereal *g; -doublecomplex *w; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* ab, doublecomplex* aa, doublecomplex* as, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, doublecomplex* w, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -2857,27 +2600,28 @@ ftnlen sname_len; static doublecomplex alpha; static doublereal rbeta; static logical isame[13]; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; - extern /* Subroutine */ int zmmch_(); + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static doublereal rbets; static logical reset; static char trans[1]; static logical upper; static char uplos[1]; static integer ia, ib, jc, ma, na, nc; - extern /* Subroutine */ int zprcn5_(), zprcn7_(); + extern /* Subroutine */ int zprcn5_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ int zprcn7_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); static integer ik, in, jj, lj, ks, ns; static doublereal errmax; - extern logical lzeres_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static char transs[1], transt[1]; - extern /* Subroutine */ int czher2k_(); + extern /* Subroutine */ int czher2k_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublereal*, doublecomplex*, integer*, ftnlen, ftnlen); static integer laa, lbb, lda, lcc, ldb, ldc; static doublecomplex als; static integer ict, icu; - extern /* Subroutine */ int czsyr2k_(); + extern /* Subroutine */ int czsyr2k_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static doublereal err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZHER2K and ZSYR2K. */ @@ -3349,20 +3093,7 @@ L160: } /* zchk5_ */ -/* Subroutine */ int zprcn5_(nout, nc, sname, iorder, uplo, transa, n, k, - alpha, lda, ldb, beta, ldc, sname_len, uplo_len, transa_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *uplo, *transa; -integer *n, *k; -doublecomplex *alpha; -integer *lda, *ldb; -doublecomplex *beta; -integer *ldc; -ftnlen sname_len; -ftnlen uplo_len; -ftnlen transa_len; +/* Subroutine */ int zprcn5_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { /* Local variables */ static char ca[14], cu[14], crc[14]; @@ -3392,20 +3123,7 @@ return 0; -/* Subroutine */ int zprcn7_(nout, nc, sname, iorder, uplo, transa, n, k, - alpha, lda, ldb, beta, ldc, sname_len, uplo_len, transa_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *uplo, *transa; -integer *n, *k; -doublecomplex *alpha; -integer *lda, *ldb; -doublereal *beta; -integer *ldc; -ftnlen sname_len; -ftnlen uplo_len; -ftnlen transa_len; +/* Subroutine */ int zprcn7_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { /* Local variables */ @@ -3435,19 +3153,7 @@ return 0; } /* zprcn7_ */ -/* Subroutine */ int zmake_(type__, uplo, diag, m, n, a, nmax, aa, lda, reset, - transl, type_len, uplo_len, diag_len) -char *type__, *uplo, *diag; -integer *m, *n; -doublecomplex *a; -integer *nmax; -doublecomplex *aa; -integer *lda; -logical *reset; -doublecomplex *transl; -ftnlen type_len; -ftnlen uplo_len; -ftnlen diag_len; +/* Subroutine */ int zmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublecomplex* a, integer* nmax, doublecomplex* aa, integer* lda, logical* reset, doublecomplex* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; @@ -3456,7 +3162,7 @@ ftnlen diag_len; /* Local variables */ static integer ibeg, iend; - extern /* Double Complex */ VOID zbeg_(); + extern /* Double Complex */ VOID zbeg_(doublecomplex*, logical*); static logical unit; static integer i__, j; static logical lower, upper; @@ -3629,27 +3335,7 @@ ftnlen diag_len; } /* zmake_ */ -/* Subroutine */ int zmmch_(transa, transb, m, n, kk, alpha, a, lda, b, ldb, - beta, c__, ldc, ct, g, cc, ldcc, eps, err, fatal, nout, mv, - transa_len, transb_len) -char *transa, *transb; -integer *m, *n, *kk; -doublecomplex *alpha, *a; -integer *lda; -doublecomplex *b; -integer *ldb; -doublecomplex *beta, *c__; -integer *ldc; -doublecomplex *ct; -doublereal *g; -doublecomplex *cc; -integer *ldcc; -doublereal *eps, *err; -logical *fatal; -integer *nout; -logical *mv; -ftnlen transa_len; -ftnlen transb_len; +/* Subroutine */ int zmmch_(char* transa, char* transb, integer* m, integer* n, integer* kk, doublecomplex* alpha, doublecomplex* a, integer* lda, doublecomplex* b, integer* ldb, doublecomplex* beta, doublecomplex* c__, integer* ldc, doublecomplex* ct, doublereal* g, doublecomplex* cc, integer* ldcc, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen transa_len, ftnlen transb_len) { /* System generated locals */ @@ -3658,7 +3344,7 @@ ftnlen transb_len; doublereal d__1, d__2, d__3, d__4, d__5, d__6; doublecomplex z__1, z__2, z__3, z__4; - double sqrt(); + double sqrt(double); /* Local variables */ static doublereal erri; static integer i__, j, k; @@ -4031,9 +3717,7 @@ L250: } /* zmmch_ */ -logical lze_(ri, rj, lr) -doublecomplex *ri, *rj; -integer *lr; +logical lze_(doublecomplex* ri, doublecomplex* rj, integer* lr) { /* System generated locals */ integer i__1, i__2, i__3; @@ -4082,13 +3766,7 @@ L30: } /* lze_ */ -logical lzeres_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len) -char *type__, *uplo; -integer *m, *n; -doublecomplex *aa, *as; -integer *lda; -ftnlen type_len; -ftnlen uplo_len; +logical lzeres_(char* type__, char* uplo, integer* m, integer* n, doublecomplex *aa, doublecomplex* as, integer* lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4; @@ -4184,9 +3862,7 @@ L80: } /* lzeres_ */ -/* Double Complex */ VOID zbeg_( ret_val, reset) -doublecomplex * ret_val; -logical *reset; +/* Double Complex */ VOID zbeg_(doublecomplex* ret_val, logical* reset) { /* System generated locals */ doublereal d__1, d__2; @@ -4249,8 +3925,7 @@ L10: } /* zbeg_ */ -doublereal ddiff_(x, y) -doublereal *x, *y; +doublereal ddiff_(doublereal* x, doublereal* y) { /* System generated locals */ doublereal ret_val; diff --git a/ctest/c_zblat3c_3m.c b/ctest/c_zblat3c_3m.c new file mode 100644 index 000000000..0c76f11e7 --- /dev/null +++ b/ctest/c_zblat3c_3m.c @@ -0,0 +1,3951 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#include "common.h" + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 + + +/* Common Block Declarations */ + +struct { + integer infot, noutc; + logical ok, lerr; +} infoc_; + +#define infoc_1 infoc_ + +struct { + char srnamt[12]; +} srnamc_; + +#define srnamc_1 srnamc_ + +/* Table of constant values */ + +static doublecomplex c_b1 = {0.,0.}; +static doublecomplex c_b2 = {1.,0.}; +static integer c__1 = 1; +static integer c__65 = 65; +static doublereal c_b92 = 1.; +static integer c__6 = 6; +static logical c_true = TRUE_; +static integer c__0 = 0; +static logical c_false = FALSE_; + +/* Main program MAIN__() */ int main(void) +{ + /* Initialized data */ + + static char snames[9][13] = { "cblas_zgemm3m ", "cblas_zhemm ", "cblas_zsymm ", "cblas_ztrmm ", + "cblas_ztrsm ", "cblas_zherk ", "cblas_zsyrk ", "cblas_zher2k", "cblas_zsyr2k"}; + + /* System generated locals */ + integer i__1, i__2, i__3, i__4, i__5; + doublereal d__1; + + /* Builtin functions */ + integer s_rsle(void), do_lio(void), e_rsle(void), f_open(void), s_wsfe(void), do_fio(void), + e_wsfe(void), s_wsle(void), e_wsle(void), s_rsfe(void), e_rsfe(void); + + /* Local variables */ + static integer nalf, idim[9]; + static logical same; + static integer nbet, ntra; + static logical rewi; + extern /* Subroutine */ int zchk1_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int zchk2_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int zchk3_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen); + extern /* Subroutine */ int zchk4_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int zchk5_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen); + static doublecomplex c__[4225] /* was [65][65] */; + static doublereal g[65]; + static integer i__, j; + extern doublereal ddiff_(doublereal*, doublereal*); + static integer n; + static logical fatal; + static doublecomplex w[130]; + static logical trace; + static integer nidim; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static char snaps[32]; + static integer isnum; + static logical ltest[9]; + static doublecomplex aa[4225], ab[8450] /* was [65][130] */, bb[4225], + cc[4225], as[4225], bs[4225], cs[4225], ct[65]; + static logical sfatal, corder; + static char snamet[12], transa[1], transb[1]; + static doublereal thresh; + static logical rorder; + static integer layout; + static logical ltestt, tsterr; + extern /* Subroutine */ int cz3chke_(char*, ftnlen); + static doublecomplex alf[7], bet[7]; + static doublereal eps, err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); + char tmpchar; + +/* Test program for the COMPLEX*16 Level 3 Blas. */ + +/* The program must be driven by a short data file. The first 13 records */ +/* of the file are read using list-directed input, the last 9 records */ +/* are read using the format ( A12,L2 ). An annotated example of a data */ +/* file can be obtained by deleting the first 3 characters from the */ +/* following 22 lines: */ +/* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE */ +/* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) */ +/* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. */ +/* F LOGICAL FLAG, T TO STOP ON FAILURES. */ +/* T LOGICAL FLAG, T TO TEST ERROR EXITS. */ +/* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH */ +/* 16.0 THRESHOLD VALUE OF TEST RATIO */ +/* 6 NUMBER OF VALUES OF N */ +/* 0 1 2 3 5 9 VALUES OF N */ +/* 3 NUMBER OF VALUES OF ALPHA */ +/* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA */ +/* 3 NUMBER OF VALUES OF BETA */ +/* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA */ +/* ZGEMM T PUT F FOR NO TEST. SAME COLUMNS. */ +/* ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. */ +/* ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. */ +/* ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. */ +/* ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. */ +/* ZHERK T PUT F FOR NO TEST. SAME COLUMNS. */ +/* ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. */ +/* ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. */ +/* ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. */ + +/* See: */ + +/* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. */ +/* A Set of Level 3 Basic Linear Algebra Subprograms. */ + +/* Technical Memorandum No.88 (Revision 1), Mathematics and */ +/* Computer Science Division, Argonne National Laboratory, 9700 */ +/* South Cass Avenue, Argonne, Illinois 60439, US. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Parameters .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ +/* .. Executable Statements .. */ + + infoc_1.noutc = 6; + +/* Read name and unit number for snapshot output file and open file. */ + + char line[80]; + + fgets(line,80,stdin); + sscanf(line,"'%s'",snaps); + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&ntra); +#else + sscanf(line,"%d",&ntra); +#endif + trace = ntra >= 0; + if (trace) { +/* o__1.oerr = 0; + o__1.ounit = ntra; + o__1.ofnmlen = 32; + o__1.ofnm = snaps; + o__1.orl = 0; + o__1.osta = "NEW"; + o__1.oacc = 0; + o__1.ofm = 0; + o__1.oblnk = 0; + f_open(&o__1);*/ + } +/* Read the flag that directs rewinding of the snapshot file. */ + fgets(line,80,stdin); + sscanf(line,"%d",&rewi); + rewi = rewi && trace; +/* Read the flag that directs stopping on any failure. */ + fgets(line,80,stdin); + sscanf(line,"%c",&tmpchar); + sfatal=FALSE_; + if (tmpchar=='T')sfatal=TRUE_; +/* Read the flag that indicates whether error exits are to be tested. */ + fgets(line,80,stdin); + sscanf(line,"%c",&tmpchar); + tsterr=FALSE_; + if (tmpchar=='T')tsterr=TRUE_; +/* Read the flag that indicates whether row-major data layout to be tested. */ + fgets(line,80,stdin); + sscanf(line,"%d",&layout); +/* Read the threshold value of the test ratio */ + fgets(line,80,stdin); + sscanf(line,"%lf",&thresh); + +/* Read and check the parameter values for the tests. */ + +/* Values of N */ + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%d",&nidim); +#else + sscanf(line,"%d",&nidim); +#endif + if (nidim < 1 || nidim > 9) { + fprintf(stderr,"NUMBER OF VALUES OF N IS LESS THAN 1 OR GREATER THAN 9"); + goto L220; + } + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld %ld %ld %ld %ld %ld %ld %ld %ld",&idim[0],&idim[1],&idim[2], + &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]); +#else + sscanf(line,"%d %d %d %d %d %d %d %d %d",&idim[0],&idim[1],&idim[2], + &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]); +#endif + i__1 = nidim; + for (i__ = 1; i__ <= i__1; ++i__) { + if (idim[i__ - 1] < 0 || idim[i__ - 1] > 65) { + fprintf(stderr,"VALUE OF N IS LESS THAN 0 OR GREATER THAN 65\n"); + goto L220; + } +/* L10: */ + } +/* Values of ALPHA */ + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&nalf); +#else + sscanf(line,"%d",&nalf); +#endif + if (nalf < 1 || nalf > 7) { + fprintf(stderr,"VALUE OF ALPHA IS LESS THAN 0 OR GREATER THAN 7\n"); + goto L220; + } + fgets(line,80,stdin); + sscanf(line,"(%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf)",&alf[0].r,&alf[0].i,&alf[1].r,&alf[1].i,&alf[2].r,&alf[2].i,&alf[3].r,&alf[3].i, + &alf[4].r,&alf[4].i,&alf[5].r,&alf[5].i,&alf[6].r,&alf[6].i); + +/* Values of BETA */ + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&nbet); +#else + sscanf(line,"%d",&nbet); +#endif + if (nalf < 1 || nbet > 7) { + fprintf(stderr,"VALUE OF BETA IS LESS THAN 0 OR GREATER THAN 7\n"); + goto L220; + } + fgets(line,80,stdin); + sscanf(line,"(%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf)",&bet[0].r,&bet[0].i,&bet[1].r,&bet[1].i,&bet[2].r,&bet[2].i,&bet[3].r,&bet[3].i, + &bet[4].r,&bet[4].i,&bet[5].r,&bet[5].i,&bet[6].r,&bet[6].i); + +/* Report values of parameters. */ + + printf("TESTS OF THE DOUBLE PRECISION COMPLEX LEVEL 3 BLAS\nTHE FOLLOWING PARAMETER VALUES WILL BE USED:\n"); + printf(" FOR N"); + for (i__ =1; i__ <=nidim;++i__) printf(" %d",idim[i__-1]); + printf("\n"); + printf(" FOR ALPHA"); + for (i__ =1; i__ <=nalf;++i__) printf(" (%lf,%lf)",alf[i__-1].r,alf[i__-1].i); + printf("\n"); + printf(" FOR BETA"); + for (i__ =1; i__ <=nbet;++i__) printf(" (%lf,%lf)",bet[i__-1].r,bet[i__-1].i); + printf("\n"); + + if (! tsterr) { + printf(" ERROR-EXITS WILL NOT BE TESTED\n"); + } + + printf("ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LESS THAN %lf\n",thresh); + rorder = FALSE_; + corder = FALSE_; + if (layout == 2) { + rorder = TRUE_; + corder = TRUE_; + printf("COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED\n"); + } else if (layout == 1) { + rorder = TRUE_; + printf("ROW-MAJOR DATA LAYOUT IS TESTED\n"); + } else if (layout == 0) { + corder = TRUE_; + printf("COLUMN-MAJOR DATA LAYOUT IS TESTED\n"); + } + +/* Read names of subroutines and flags which indicate */ +/* whether they are to be tested. */ + + for (i__ = 1; i__ <= 9; ++i__) { + ltest[i__ - 1] = FALSE_; +/* L20: */ + } +L30: + if (! fgets(line,80,stdin)) { + goto L60; + } + i__1 = sscanf(line,"%12c %c",snamet,&tmpchar); + ltestt=FALSE_; + if (tmpchar=='T')ltestt=TRUE_; + if (i__1 < 2) { + goto L60; + } + for (i__ = 1; i__ <= 9; ++i__) { + if (s_cmp(snamet, snames[i__ - 1] , (ftnlen)12, (ftnlen)12) == + 0) { + goto L50; + } +/* L40: */ + } + printf("SUBPROGRAM NAME %s NOT RECOGNIZED\n****** TESTS ABANDONED ******\n",snamet); + exit(1); +L50: + ltest[i__ - 1] = ltestt; + goto L30; + +L60: +/* cl__1.cerr = 0; + cl__1.cunit = 5; + cl__1.csta = 0; + f_clos(&cl__1);*/ + +/* Compute EPS (the machine precision). */ + + eps = 1.; +L70: + d__1 = eps + 1.; + if (ddiff_(&d__1, &c_b92) == 0.) { + goto L80; + } + eps *= .5; + goto L70; +L80: + eps += eps; + printf("RELATIVE MACHINE PRECISION IS TAKEN TO BE %9.1g\n",eps); + +/* Check the reliability of ZMMCH using exact data. */ + + n = 32; + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * 65 - 66; +/* Computing MAX */ + i__5 = i__ - j + 1; + i__4 = f2cmax(i__5,0); + ab[i__3].r = (doublereal) i__4, ab[i__3].i = 0.; +/* L90: */ + } + i__2 = j + 4224; + ab[i__2].r = (doublereal) j, ab[i__2].i = 0.; + i__2 = (j + 65) * 65 - 65; + ab[i__2].r = (doublereal) j, ab[i__2].i = 0.; + i__2 = j - 1; + c__[i__2].r = 0., c__[i__2].i = 0.; +/* L100: */ + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = j - 1; + i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3; + cc[i__2].r = (doublereal) i__3, cc[i__2].i = 0.; +/* L110: */ + } +/* CC holds the exact result. On exit from ZMMCH CT holds */ +/* the result computed by ZMMCH. */ + *(unsigned char *)transa = 'N'; + *(unsigned char *)transb = 'N'; + zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true, (ftnlen)1, (ftnlen)1); + same = lze_(cc, ct, &n); + if (! same || err != 0.) { + printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); + } + *(unsigned char *)transb = 'C'; + zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true, (ftnlen)1, (ftnlen)1); + same = lze_(cc, ct, &n); + if (! same || err != 0.) { + printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = j + 4224; + i__3 = n - j + 1; + ab[i__2].r = (doublereal) i__3, ab[i__2].i = 0.; + i__2 = (j + 65) * 65 - 65; + i__3 = n - j + 1; + ab[i__2].r = (doublereal) i__3, ab[i__2].i = 0.; +/* L120: */ + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = n - j; + i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3; + cc[i__2].r = (doublereal) i__3, cc[i__2].i = 0.; +/* L130: */ + } + *(unsigned char *)transa = 'C'; + *(unsigned char *)transb = 'N'; + zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true, (ftnlen)1, (ftnlen)1); + same = lze_(cc, ct, &n); + if (! same || err != 0.) { + printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); + } + *(unsigned char *)transb = 'C'; + zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true, (ftnlen)1, (ftnlen)1); + same = lze_(cc, ct, &n); + if (! same || err != 0.) { + printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); + } + +/* Test each subroutine in turn. */ + + for (isnum = 1; isnum <= 9; ++isnum) { + if (! ltest[isnum - 1]) { +/* Subprogram is not to be tested. */ + printf("%12s WAS NOT TESTED\n",snames[isnum-1]); + } else { + s_copy(srnamc_1.srnamt, snames[isnum - 1], (ftnlen)12, ( + ftnlen)12); +/* Test error exits. */ + if (tsterr) { + cz3chke_(snames[isnum - 1], (ftnlen)12); + } +/* Test computations. */ + infoc_1.infot = 0; + infoc_1.ok = TRUE_; + fatal = FALSE_; + switch ((int)isnum) { + case 1: goto L140; + case 2: goto L150; + case 3: goto L150; + case 4: goto L160; + case 5: goto L160; + case 6: goto L170; + case 7: goto L170; + case 8: goto L180; + case 9: goto L180; + } +/* Test ZGEMM, 01. */ +L140: + if (corder) { + zchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0, (ftnlen)12); + } + if (rorder) { + zchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1, (ftnlen)12); + } + goto L190; +/* Test ZHEMM, 02, ZSYMM, 03. */ +L150: + if (corder) { + zchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0, (ftnlen)12); + } + if (rorder) { + zchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1, (ftnlen)12); + } + goto L190; +/* Test ZTRMM, 04, ZTRSM, 05. */ +L160: + if (corder) { + zchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & + c__0, (ftnlen)12); + } + if (rorder) { + zchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & + c__1, (ftnlen)12); + } + goto L190; +/* Test ZHERK, 06, ZSYRK, 07. */ +L170: + if (corder) { + zchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0, (ftnlen)12); + } + if (rorder) { + zchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1, (ftnlen)12); + } + goto L190; +/* Test ZHER2K, 08, ZSYR2K, 09. */ +L180: + if (corder) { + zchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, + ct, g, w, &c__0, (ftnlen)12); + } + if (rorder) { + zchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, + ct, g, w, &c__1, (ftnlen)12); + } + goto L190; + +L190: + if (fatal && sfatal) { + goto L210; + } + } +/* L200: */ + } + printf("\nEND OF TESTS\n"); + goto L230; + +L210: + printf("\n****** FATAL ERROR - TESTS ABANDONED ******\n"); + goto L230; + +L220: + printf("AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM\n"); + printf("****** TESTS ABANDONED ******\n"); + +L230: + if (trace) { +/* cl__1.cerr = 0; + cl__1.cunit = ntra; + cl__1.csta = 0; + f_clos(&cl__1);*/ + } +/* cl__1.cerr = 0; + cl__1.cunit = 6; + cl__1.csta = 0; + f_clos(&cl__1);*/ + exit(0); + +/* End of ZBLAT3. */ + +} /* MAIN__ */ + +/* Subroutine */ int zchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len) +{ + /* Initialized data */ + + static char ich[3+1] = "NTC"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7, i__8; + + /* Local variables */ + static doublecomplex beta; + static integer ldas, ldbs, ldcs; + static logical same, null; + static integer i__, k, m, n; + static doublecomplex alpha; + static logical isame[13], trana, tranb; + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); + static integer nargs; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static logical reset; + static integer ia, ib; + extern /* Subroutine */ int zprcn1_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + static integer ma, mb, na, nb, nc, ik, im, in, ks, ms, ns; + extern /* Subroutine */ void czgemm3m_(integer*, char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static char tranas[1], tranbs[1], transa[1], transb[1]; + static doublereal errmax; + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static integer ica, icb, laa, lbb, lda, lcc, ldb, ldc; + static doublecomplex als, bls; + static doublereal err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); + +/* Tests ZGEMM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ +/* .. Executable Statements .. */ + + nargs = 13; + nc = 0; + reset = TRUE_; + errmax = 0.; + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = m; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L100; + } + lcc = ldc * n; + null = n <= 0 || m <= 0; + + i__3 = *nidim; + for (ik = 1; ik <= i__3; ++ik) { + k = idim[ik]; + + for (ica = 1; ica <= 3; ++ica) { + *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1] + ; + trana = *(unsigned char *)transa == 'T' || *(unsigned + char *)transa == 'C'; + + if (trana) { + ma = k; + na = m; + } else { + ma = m; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + +/* Generate the matrix A. */ + + zmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[ + 1], &lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, ( + ftnlen)1); + + for (icb = 1; icb <= 3; ++icb) { + *(unsigned char *)transb = *(unsigned char *)&ich[icb + - 1]; + tranb = *(unsigned char *)transb == 'T' || *(unsigned + char *)transb == 'C'; + + if (tranb) { + mb = n; + nb = k; + } else { + mb = k; + nb = n; + } +/* Set LDB to 1 more than minimum value if room. */ + ldb = mb; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L70; + } + lbb = ldb * nb; + +/* Generate the matrix B. */ + + zmake_("ge", " ", " ", &mb, &nb, &b[b_offset], nmax, & + bb[1], &ldb, &reset, &c_b1, (ftnlen)2, ( + ftnlen)1, (ftnlen)1); + + i__4 = *nalf; + for (ia = 1; ia <= i__4; ++ia) { + i__5 = ia; + alpha.r = alf[i__5].r, alpha.i = alf[i__5].i; + + i__5 = *nbet; + for (ib = 1; ib <= i__5; ++ib) { + i__6 = ib; + beta.r = bet[i__6].r, beta.i = bet[i__6].i; + +/* Generate the matrix C. */ + + zmake_("ge", " ", " ", &m, &n, &c__[c_offset], + nmax, &cc[1], &ldc, &reset, &c_b1, ( + ftnlen)2, (ftnlen)1, (ftnlen)1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)tranas = *(unsigned char *) + transa; + *(unsigned char *)tranbs = *(unsigned char *) + transb; + ms = m; + ns = n; + ks = k; + als.r = alpha.r, als.i = alpha.i; + i__6 = laa; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + as[i__7].r = aa[i__8].r, as[i__7].i = aa[ + i__8].i; +/* L10: */ + } + ldas = lda; + i__6 = lbb; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + bs[i__7].r = bb[i__8].r, bs[i__7].i = bb[ + i__8].i; +/* L20: */ + } + ldbs = ldb; + bls.r = beta.r, bls.i = beta.i; + i__6 = lcc; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + cs[i__7].r = cc[i__8].r, cs[i__7].i = cc[ + i__8].i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (*trace) { + zprcn1_(ntra, &nc, sname, iorder, transa, + transb, &m, &n, &k, &alpha, &lda, + &ldb, &beta, &ldc, (ftnlen)12, ( + ftnlen)1, (ftnlen)1); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + czgemm3m_(iorder, transa, transb, &m, &n, &k, & + alpha, &aa[1], &lda, &bb[1], &ldb, & + beta, &cc[1], &ldc, (ftnlen)1, ( + ftnlen)1); + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + printf(" *** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); + *fatal = TRUE_; + goto L120; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)transa == *( + unsigned char *)tranas; + isame[1] = *(unsigned char *)transb == *( + unsigned char *)tranbs; + isame[2] = ms == m; + isame[3] = ns == n; + isame[4] = ks == k; + isame[5] = als.r == alpha.r && als.i == + alpha.i; + isame[6] = lze_(&as[1], &aa[1], &laa); + isame[7] = ldas == lda; + isame[8] = lze_(&bs[1], &bb[1], &lbb); + isame[9] = ldbs == ldb; + isame[10] = bls.r == beta.r && bls.i == + beta.i; + if (null) { + isame[11] = lze_(&cs[1], &cc[1], &lcc); + } else { + isame[11] = lzeres_("ge", " ", &m, &n, & + cs[1], &cc[1], &ldc, (ftnlen)2, ( + ftnlen)1); + } + isame[12] = ldcs == ldc; + +/* If data was incorrectly changed, report */ +/* and return. */ + + same = TRUE_; + i__6 = nargs; + for (i__ = 1; i__ <= i__6; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L120; + } + + if (! null) { + +/* Check the result. */ + + zmmch_(transa, transb, &m, &n, &k, &alpha, + &a[a_offset], nmax, &b[b_offset], + nmax, &beta, &c__[c_offset], + nmax, &ct[1], &g[1], &cc[1], &ldc, + eps, &err, fatal, nout, &c_true, + (ftnlen)1, (ftnlen)1); + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L120; + } + } + +/* L50: */ + } + +/* L60: */ + } + +L70: + ; + } + +L80: + ; + } + +/* L90: */ + } + +L100: + ; + } + +/* L110: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + if (*iorder == 1) { + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + } else { + if (*iorder == 0) { + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + if (*iorder == 1) { + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + } + goto L130; + +L120: + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + zprcn1_(nout, &nc, sname, iorder, transa, transb, &m, &n, &k, &alpha, & + lda, &ldb, &beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); + +L130: + return 0; + +/* 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', */ +/* $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, */ +/* $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) */ + +/* End of ZCHK1. */ + +} /* zchk1_ */ + + +/* Subroutine */ int zprcn1_(integer* nout, integer* nc, char* sname, integer* iorder, char* transa, char* transb, integer* m, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen transa_len, ftnlen transb_len) +{ + /* Local variables */ + static char crc[14], cta[14], ctb[14]; + + if (*(unsigned char *)transa == 'N') { + s_copy(cta, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(cta, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cta, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transb == 'N') { + s_copy(ctb, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transb == 'T') { + s_copy(ctb, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ctb, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cta,ctb); + printf("%d %d %d (%4.1lf,%4.1lf) , A, %d, B, %d, (%4.1lf,%4.1lf) , C, %d.\n",*m,*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); + +return 0; +} /* zprcn1_ */ + + +/* Subroutine */ int zchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len) +{ + /* Initialized data */ + + static char ichs[2+1] = "LR"; + static char ichu[2+1] = "UL"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + + /* Local variables */ + static doublecomplex beta; + static integer ldas, ldbs, ldcs; + static logical same; + static char side[1]; + static logical isconj, left, null; + static char uplo[1]; + static integer i__, m, n; + static doublecomplex alpha; + static logical isame[13]; + static char sides[1]; + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); + static integer nargs; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static logical reset; + static char uplos[1]; + static integer ia, ib; + extern /* Subroutine */ int zprcn2_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + static integer na, nc, im, in, ms, ns; + extern /* Subroutine */ void czhemm_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static doublereal errmax; + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void czsymm_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static integer laa, lbb, lda, lcc, ldb, ldc, ics; + static doublecomplex als, bls; + static integer icu; + static doublereal err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); + +/* Tests ZHEMM and ZSYMM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ +/* .. Executable Statements .. */ + isconj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 12; + nc = 0; + reset = TRUE_; + errmax = 0.; + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = m; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L90; + } + lcc = ldc * n; + null = n <= 0 || m <= 0; +/* Set LDB to 1 more than minimum value if room. */ + ldb = m; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L90; + } + lbb = ldb * n; + +/* Generate the matrix B. */ + + zmake_("ge", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, & + reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1); + + for (ics = 1; ics <= 2; ++ics) { + *(unsigned char *)side = *(unsigned char *)&ichs[ics - 1]; + left = *(unsigned char *)side == 'L'; + + if (left) { + na = m; + } else { + na = n; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = na; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + +/* Generate the hermitian or symmetric matrix A. */ + + zmake_(sname + 7, uplo, " ", &na, &na, &a[a_offset], nmax, + &aa[1], &lda, &reset, &c_b1, (ftnlen)2, (ftnlen) + 1, (ftnlen)1); + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + +/* Generate the matrix C. */ + + zmake_("ge", " ", " ", &m, &n, &c__[c_offset], + nmax, &cc[1], &ldc, &reset, &c_b1, ( + ftnlen)2, (ftnlen)1, (ftnlen)1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)sides = *(unsigned char *)side; + *(unsigned char *)uplos = *(unsigned char *)uplo; + ms = m; + ns = n; + als.r = alpha.r, als.i = alpha.i; + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + i__5 = lbb; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7] + .i; +/* L20: */ + } + ldbs = ldb; + bls.r = beta.r, bls.i = beta.i; + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (*trace) { + zprcn2_(ntra, &nc, sname, iorder, side, uplo, + &m, &n, &alpha, &lda, &ldb, &beta, & + ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1) + ; + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + if (isconj) { + czhemm_(iorder, side, uplo, &m, &n, &alpha, & + aa[1], &lda, &bb[1], &ldb, &beta, &cc[ + 1], &ldc, (ftnlen)1, (ftnlen)1); + } else { + czsymm_(iorder, side, uplo, &m, &n, &alpha, & + aa[1], &lda, &bb[1], &ldb, &beta, &cc[ + 1], &ldc, (ftnlen)1, (ftnlen)1); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); + *fatal = TRUE_; + goto L110; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)sides == *(unsigned + char *)side; + isame[1] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[2] = ms == m; + isame[3] = ns == n; + isame[4] = als.r == alpha.r && als.i == alpha.i; + isame[5] = lze_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + isame[7] = lze_(&bs[1], &bb[1], &lbb); + isame[8] = ldbs == ldb; + isame[9] = bls.r == beta.r && bls.i == beta.i; + if (null) { + isame[10] = lze_(&cs[1], &cc[1], &lcc); + } else { + isame[10] = lzeres_("ge", " ", &m, &n, &cs[1], + &cc[1], &ldc, (ftnlen)2, (ftnlen)1); + } + isame[11] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L110; + } + + if (! null) { + +/* Check the result. */ + + if (left) { + zmmch_("N", "N", &m, &n, &m, &alpha, &a[ + a_offset], nmax, &b[b_offset], + nmax, &beta, &c__[c_offset], nmax, + &ct[1], &g[1], &cc[1], &ldc, eps, + &err, fatal, nout, &c_true, ( + ftnlen)1, (ftnlen)1); + } else { + zmmch_("N", "N", &m, &n, &n, &alpha, &b[ + b_offset], nmax, &a[a_offset], + nmax, &beta, &c__[c_offset], nmax, + &ct[1], &g[1], &cc[1], &ldc, eps, + &err, fatal, nout, &c_true, ( + ftnlen)1, (ftnlen)1); + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L110; + } + } + +/* L50: */ + } + +/* L60: */ + } + +/* L70: */ + } + +L80: + ; + } + +L90: + ; + } + +/* L100: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + if (*iorder == 1) { + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + } else { + if (*iorder == 0) { + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + if (*iorder == 1) { + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + } + goto L120; + +L110: + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + zprcn2_(nout, &nc, sname, iorder, side, uplo, &m, &n, &alpha, &lda, &ldb, + &beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); + +L120: + return 0; + +/* 9995 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */ +/* $ ',', F4.1, '), C,', I3, ') .' ) */ + +/* End of ZCHK2. */ + +} /* zchk2_ */ + + +/* Subroutine */ int zprcn2_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, integer* m, integer* n, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len) +{ + /* Local variables */ + static char cs[14], cu[14], crc[14]; + + if (*(unsigned char *)side == 'L') { + s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cs, " CblasRight", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu); + printf("%d %d (%4.1lf,%4.1lf) , A, %d, B, %d, (%4.1lf,%4.1lf) , C, %d.\n",*m,*n,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); + +return 0; +} /* zprcn2_ */ + + +/* Subroutine */ int zchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* ct, doublereal* g, doublecomplex* c__, integer* iorder, ftnlen sname_len) +{ + /* Initialized data */ + + static char ichu[2+1] = "UL"; + static char icht[3+1] = "NTC"; + static char ichd[2+1] = "UN"; + static char ichs[2+1] = "LR"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + doublecomplex z__1; + + /* Local variables */ + static char diag[1]; + static integer ldas, ldbs; + static logical same; + static char side[1]; + static logical left, null; + static char uplo[1]; + static integer i__, j, m, n; + static doublecomplex alpha; + static char diags[1]; + static logical isame[13]; + static char sides[1]; + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); + static integer nargs; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static logical reset; + static char uplos[1]; + static integer ia, na; + extern /* Subroutine */ int zprcn3_(integer*, integer*, char*, integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, ftnlen, ftnlen, ftnlen, ftnlen, ftnlen); + static integer nc, im, in, ms, ns; + static char tranas[1], transa[1]; + static doublereal errmax; + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cztrmm_(integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cztrsm_(integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen, ftnlen); + static integer laa, icd, lbb, lda, ldb, ics; + static doublecomplex als; + static integer ict, icu; + static doublereal err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); + +/* Tests ZTRMM and ZTRSM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ + /* Parameter adjustments */ + --idim; + --alf; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --g; + --ct; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ +/* .. Executable Statements .. */ + + nargs = 11; + nc = 0; + reset = TRUE_; + errmax = 0.; +/* Set up zero matrix for ZMMCH. */ + i__1 = *nmax; + for (j = 1; j <= i__1; ++j) { + i__2 = *nmax; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * c_dim1; + c__[i__3].r = 0., c__[i__3].i = 0.; +/* L10: */ + } +/* L20: */ + } + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDB to 1 more than minimum value if room. */ + ldb = m; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L130; + } + lbb = ldb * n; + null = m <= 0 || n <= 0; + + for (ics = 1; ics <= 2; ++ics) { + *(unsigned char *)side = *(unsigned char *)&ichs[ics - 1]; + left = *(unsigned char *)side == 'L'; + if (left) { + na = m; + } else { + na = n; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = na; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L130; + } + laa = lda * na; + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + + for (ict = 1; ict <= 3; ++ict) { + *(unsigned char *)transa = *(unsigned char *)&icht[ + ict - 1]; + + for (icd = 1; icd <= 2; ++icd) { + *(unsigned char *)diag = *(unsigned char *)&ichd[ + icd - 1]; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + +/* Generate the matrix A. */ + + zmake_("tr", uplo, diag, &na, &na, &a[ + a_offset], nmax, &aa[1], &lda, &reset, + &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen) + 1); + +/* Generate the matrix B. */ + + zmake_("ge", " ", " ", &m, &n, &b[b_offset], + nmax, &bb[1], &ldb, &reset, &c_b1, ( + ftnlen)2, (ftnlen)1, (ftnlen)1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)sides = *(unsigned char *) + side; + *(unsigned char *)uplos = *(unsigned char *) + uplo; + *(unsigned char *)tranas = *(unsigned char *) + transa; + *(unsigned char *)diags = *(unsigned char *) + diag; + ms = m; + ns = n; + als.r = alpha.r, als.i = alpha.i; + i__4 = laa; + for (i__ = 1; i__ <= i__4; ++i__) { + i__5 = i__; + i__6 = i__; + as[i__5].r = aa[i__6].r, as[i__5].i = aa[ + i__6].i; +/* L30: */ + } + ldas = lda; + i__4 = lbb; + for (i__ = 1; i__ <= i__4; ++i__) { + i__5 = i__; + i__6 = i__; + bs[i__5].r = bb[i__6].r, bs[i__5].i = bb[ + i__6].i; +/* L40: */ + } + ldbs = ldb; + +/* Call the subroutine. */ + + if (s_cmp(sname + 9, "mm", (ftnlen)2, (ftnlen) + 2) == 0) { + if (*trace) { + zprcn3_(ntra, &nc, sname, iorder, + side, uplo, transa, diag, &m, + &n, &alpha, &lda, &ldb, ( + ftnlen)12, (ftnlen)1, (ftnlen) + 1, (ftnlen)1, (ftnlen)1); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + cztrmm_(iorder, side, uplo, transa, diag, + &m, &n, &alpha, &aa[1], &lda, &bb[ + 1], &ldb, (ftnlen)1, (ftnlen)1, ( + ftnlen)1, (ftnlen)1); + } else if (s_cmp(sname + 9, "sm", (ftnlen)2, ( + ftnlen)2) == 0) { + if (*trace) { + zprcn3_(ntra, &nc, sname, iorder, + side, uplo, transa, diag, &m, + &n, &alpha, &lda, &ldb, ( + ftnlen)12, (ftnlen)1, (ftnlen) + 1, (ftnlen)1, (ftnlen)1); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + cztrsm_(iorder, side, uplo, transa, diag, + &m, &n, &alpha, &aa[1], &lda, &bb[ + 1], &ldb, (ftnlen)1, (ftnlen)1, ( + ftnlen)1, (ftnlen)1); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); + *fatal = TRUE_; + goto L150; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)sides == *( + unsigned char *)side; + isame[1] = *(unsigned char *)uplos == *( + unsigned char *)uplo; + isame[2] = *(unsigned char *)tranas == *( + unsigned char *)transa; + isame[3] = *(unsigned char *)diags == *( + unsigned char *)diag; + isame[4] = ms == m; + isame[5] = ns == n; + isame[6] = als.r == alpha.r && als.i == + alpha.i; + isame[7] = lze_(&as[1], &aa[1], &laa); + isame[8] = ldas == lda; + if (null) { + isame[9] = lze_(&bs[1], &bb[1], &lbb); + } else { + isame[9] = lzeres_("ge", " ", &m, &n, &bs[ + 1], &bb[1], &ldb, (ftnlen)2, ( + ftnlen)1); + } + isame[10] = ldbs == ldb; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__4 = nargs; + for (i__ = 1; i__ <= i__4; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); + } +/* L50: */ + } + if (! same) { + *fatal = TRUE_; + goto L150; + } + + if (! null) { + if (s_cmp(sname + 9, "mm", (ftnlen)2, ( + ftnlen)2) == 0) { + +/* Check the result. */ + + if (left) { + zmmch_(transa, "N", &m, &n, &m, & + alpha, &a[a_offset], nmax, + &b[b_offset], nmax, & + c_b1, &c__[c_offset], + nmax, &ct[1], &g[1], &bb[ + 1], &ldb, eps, &err, + fatal, nout, &c_true, ( + ftnlen)1, (ftnlen)1); + } else { + zmmch_("N", transa, &m, &n, &n, & + alpha, &b[b_offset], nmax, + &a[a_offset], nmax, & + c_b1, &c__[c_offset], + nmax, &ct[1], &g[1], &bb[ + 1], &ldb, eps, &err, + fatal, nout, &c_true, ( + ftnlen)1, (ftnlen)1); + } + } else if (s_cmp(sname + 9, "sm", (ftnlen) + 2, (ftnlen)2) == 0) { + +/* Compute approximation to original */ +/* matrix. */ + + i__4 = n; + for (j = 1; j <= i__4; ++j) { + i__5 = m; + for (i__ = 1; i__ <= i__5; ++i__) + { + i__6 = i__ + j * c_dim1; + i__7 = i__ + (j - 1) * ldb; + c__[i__6].r = bb[i__7].r, c__[i__6].i = bb[i__7].i; + i__6 = i__ + (j - 1) * ldb; + i__7 = i__ + j * b_dim1; + z__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, + z__1.i = alpha.r * b[i__7].i + alpha.i * b[ + i__7].r; + bb[i__6].r = z__1.r, bb[i__6].i = z__1.i; +/* L60: */ + } +/* L70: */ + } + + if (left) { + zmmch_(transa, "N", &m, &n, &m, & + c_b2, &a[a_offset], nmax, + &c__[c_offset], nmax, & + c_b1, &b[b_offset], nmax, + &ct[1], &g[1], &bb[1], & + ldb, eps, &err, fatal, + nout, &c_false, (ftnlen)1, + (ftnlen)1); + } else { + zmmch_("N", transa, &m, &n, &n, & + c_b2, &c__[c_offset], + nmax, &a[a_offset], nmax, + &c_b1, &b[b_offset], nmax, + &ct[1], &g[1], &bb[1], & + ldb, eps, &err, fatal, + nout, &c_false, (ftnlen)1, + (ftnlen)1); + } + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L150; + } + } + +/* L80: */ + } + +/* L90: */ + } + +/* L100: */ + } + +/* L110: */ + } + +/* L120: */ + } + +L130: + ; + } + +/* L140: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + if (*iorder == 1) { + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + } else { + if (*iorder == 0) { + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + if (*iorder == 1) { + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + } + goto L160; + +L150: + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + if (*trace) { + zprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, & + alpha, &lda, &ldb, (ftnlen)12, (ftnlen)1, (ftnlen)1, (ftnlen) + 1, (ftnlen)1); + } + +L160: + return 0; + +/* 9995 FORMAT(1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', */ +/* $ ' .' ) */ + +/* End of ZCHK3. */ + +} /* zchk3_ */ + + +/* Subroutine */ int zprcn3_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, char* transa, char* diag, integer* m, integer* n, doublecomplex* alpha, integer* lda, integer* ldb, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len, ftnlen transa_len, ftnlen diag_len) +{ + + /* Local variables */ + static char ca[14], cd[14], cs[14], cu[14], crc[14]; + + if (*(unsigned char *)side == 'L') { + s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cs, " CblasRight", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)diag == 'N') { + s_copy(cd, " CblasNonUnit", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cd, " CblasUnit", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu); + printf(" %s %s %d %d (%4.1lf,%4.1lf) A %d B %d\n",ca,cd,*m,*n,alpha->r,alpha->i,*lda,*ldb); + +return 0; +} /* zprcn3_ */ + + +/* Subroutine */ int zchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len) +{ + /* Initialized data */ + + static char icht[2+1] = "NC"; + static char ichu[2+1] = "UL"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + doublecomplex z__1; + + /* Local variables */ + static doublecomplex beta; + static integer ldas, ldcs; + static logical same, isconj; + static doublecomplex bets; + static doublereal rals; + static logical tran, null; + static char uplo[1]; + static integer i__, j, k, n; + static doublecomplex alpha; + static doublereal rbeta; + static logical isame[13]; + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); + static integer nargs; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static doublereal rbets; + static logical reset; + static char trans[1]; + static logical upper; + static char uplos[1]; + static integer ia, ib, jc, ma, na; + extern /* Subroutine */ int zprcn4_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + static integer nc; + extern /* Subroutine */ int zprcn6_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); + static integer ik, in, jj, lj, ks, ns; + static doublereal ralpha; + extern /* Subroutine */ int czherk_(integer*, char*, char*, integer*, integer*, doublereal*, doublecomplex*, integer*, doublereal*, doublecomplex*, integer*, ftnlen, ftnlen); + static doublereal errmax; + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static char transs[1], transt[1]; + extern /* Subroutine */ int czsyrk_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static integer laa, lda, lcc, ldc; + static doublecomplex als; + static integer ict, icu; + static doublereal err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); + +/* Tests ZHERK and ZSYRK. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ +/* .. Executable Statements .. */ + isconj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 10; + nc = 0; + reset = TRUE_; + errmax = 0.; + rals = 1.; + rbets = 1.; + + i__1 = *nidim; + for (in = 1; in <= i__1; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = n; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L100; + } + lcc = ldc * n; + + i__2 = *nidim; + for (ik = 1; ik <= i__2; ++ik) { + k = idim[ik]; + + for (ict = 1; ict <= 2; ++ict) { + *(unsigned char *)trans = *(unsigned char *)&icht[ict - 1]; + tran = *(unsigned char *)trans == 'C'; + if (tran && ! isconj) { + *(unsigned char *)trans = 'T'; + } + if (tran) { + ma = k; + na = n; + } else { + ma = n; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + +/* Generate the matrix A. */ + + zmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[1], & + lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1); + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + upper = *(unsigned char *)uplo == 'U'; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + if (isconj) { + ralpha = alpha.r; + z__1.r = ralpha, z__1.i = 0.; + alpha.r = z__1.r, alpha.i = z__1.i; + } + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + if (isconj) { + rbeta = beta.r; + z__1.r = rbeta, z__1.i = 0.; + beta.r = z__1.r, beta.i = z__1.i; + } + null = n <= 0; + if (isconj) { + null = null ||( (k <= 0 || ralpha == 0.) && + rbeta == 1.); + } + +/* Generate the matrix C. */ + + zmake_(sname + 7, uplo, " ", &n, &n, &c__[ + c_offset], nmax, &cc[1], &ldc, &reset, & + c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1); + + ++nc; + +/* Save every datum before calling the subroutine. */ + + *(unsigned char *)uplos = *(unsigned char *)uplo; + *(unsigned char *)transs = *(unsigned char *) + trans; + ns = n; + ks = k; + if (isconj) { + rals = ralpha; + } else { + als.r = alpha.r, als.i = alpha.i; + } + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + if (isconj) { + rbets = rbeta; + } else { + bets.r = beta.r, bets.i = beta.i; + } + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L20: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (isconj) { + if (*trace) { + zprcn6_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &ralpha, &lda, & + rbeta, &ldc, (ftnlen)12, (ftnlen) + 1, (ftnlen)1); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + czherk_(iorder, uplo, trans, &n, &k, &ralpha, + &aa[1], &lda, &rbeta, &cc[1], &ldc, ( + ftnlen)1, (ftnlen)1); + } else { + if (*trace) { + zprcn4_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, & + beta, &ldc, (ftnlen)12, (ftnlen)1, + (ftnlen)1); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + czsyrk_(iorder, uplo, trans, &n, &k, &alpha, & + aa[1], &lda, &beta, &cc[1], &ldc, ( + ftnlen)1, (ftnlen)1); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); + *fatal = TRUE_; + goto L120; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[1] = *(unsigned char *)transs == *(unsigned + char *)trans; + isame[2] = ns == n; + isame[3] = ks == k; + if (isconj) { + isame[4] = rals == ralpha; + } else { + isame[4] = als.r == alpha.r && als.i == + alpha.i; + } + isame[5] = lze_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + if (isconj) { + isame[7] = rbets == rbeta; + } else { + isame[7] = bets.r == beta.r && bets.i == + beta.i; + } + if (null) { + isame[8] = lze_(&cs[1], &cc[1], &lcc); + } else { + isame[8] = lzeres_(sname + 7, uplo, &n, &n, & + cs[1], &cc[1], &ldc, (ftnlen)2, ( + ftnlen)1); + } + isame[9] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); + } +/* L30: */ + } + if (! same) { + *fatal = TRUE_; + goto L120; + } + + if (! null) { + +/* Check the result column by column. */ + + if (isconj) { + *(unsigned char *)transt = 'C'; + } else { + *(unsigned char *)transt = 'T'; + } + jc = 1; + i__5 = n; + for (j = 1; j <= i__5; ++j) { + if (upper) { + jj = 1; + lj = j; + } else { + jj = j; + lj = n - j + 1; + } + if (tran) { + zmmch_(transt, "N", &lj, &c__1, &k, & + alpha, &a[jj * a_dim1 + 1], + nmax, &a[j * a_dim1 + 1], + nmax, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true, (ftnlen) + 1, (ftnlen)1); + } else { + zmmch_("N", transt, &lj, &c__1, &k, & + alpha, &a[jj + a_dim1], nmax, + &a[j + a_dim1], nmax, &beta, & + c__[jj + j * c_dim1], nmax, & + ct[1], &g[1], &cc[jc], &ldc, + eps, &err, fatal, nout, & + c_true, (ftnlen)1, (ftnlen)1); + } + if (upper) { + jc += ldc; + } else { + jc = jc + ldc + 1; + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L110; + } +/* L40: */ + } + } + +/* L50: */ + } + +/* L60: */ + } + +/* L70: */ + } + +L80: + ; + } + +/* L90: */ + } + +L100: + ; + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + if (*iorder == 1) { + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + } else { + if (*iorder == 0) { + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + if (*iorder == 1) { + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + } + goto L130; + +L110: + if (n > 1) { + printf(" THESE ARE THE RESULTS FOR COLUMN %d:\n",j); + } + +L120: + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + if (isconj) { + zprcn6_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &ralpha, &lda, + &rbeta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); + } else { + zprcn4_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); + } + +L130: + return 0; + +/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', */ +/* $ ' .' ) */ +/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, */ +/* $ '), C,', I3, ') .' ) */ + +/* End of CCHK4. */ + +} /* zchk4_ */ + + +/* Subroutine */ int zprcn4_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) +{ + /* Local variables */ + static char ca[14], cu[14], crc[14]; + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("( %d %d (%4.1lf,%4.1lf) A %d (%4.1lf,%4.1lf) C %d\n",*n,*k,alpha->r,alpha->i,*lda,beta->r,beta->i,*ldc); + +return 0; +} /* zprcn4_ */ + + + +/* Subroutine */ int zprcn6_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublereal* alpha, integer* lda, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) +{ + + /* Local variables */ + static char ca[14], cu[14], crc[14]; + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("( %d %d %4.1lf A %d %4.1lf C %d\n",*n,*k,*alpha,*lda,*beta,*ldc); + +return 0; +} /* zprcn6_ */ + + +/* Subroutine */ int zchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* ab, doublecomplex* aa, doublecomplex* as, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, doublecomplex* w, integer* iorder, ftnlen sname_len) +{ + /* Initialized data */ + + static char icht[2+1] = "NC"; + static char ichu[2+1] = "UL"; + + /* System generated locals */ + integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; + doublecomplex z__1, z__2; + + /* Local variables */ + static integer jjab; + static doublecomplex beta; + static integer ldas, ldbs, ldcs; + static logical same, isconj; + static doublecomplex bets; + static logical tran, null; + static char uplo[1]; + static integer i__, j, k, n; + static doublecomplex alpha; + static doublereal rbeta; + static logical isame[13]; + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); + static integer nargs; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static doublereal rbets; + static logical reset; + static char trans[1]; + static logical upper; + static char uplos[1]; + static integer ia, ib, jc, ma, na, nc; + extern /* Subroutine */ int zprcn5_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ int zprcn7_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); + static integer ik, in, jj, lj, ks, ns; + static doublereal errmax; + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static char transs[1], transt[1]; + extern /* Subroutine */ int czher2k_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublereal*, doublecomplex*, integer*, ftnlen, ftnlen); + static integer laa, lbb, lda, lcc, ldb, ldc; + static doublecomplex als; + static integer ict, icu; + extern /* Subroutine */ int czsyr2k_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static doublereal err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); + +/* Tests ZHER2K and ZSYR2K. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --w; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + --as; + --aa; + --ab; + + /* Function Body */ +/* .. Executable Statements .. */ + isconj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 12; + nc = 0; + reset = TRUE_; + errmax = 0.; + + i__1 = *nidim; + for (in = 1; in <= i__1; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = n; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L130; + } + lcc = ldc * n; + + i__2 = *nidim; + for (ik = 1; ik <= i__2; ++ik) { + k = idim[ik]; + + for (ict = 1; ict <= 2; ++ict) { + *(unsigned char *)trans = *(unsigned char *)&icht[ict - 1]; + tran = *(unsigned char *)trans == 'C'; + if (tran && ! isconj) { + *(unsigned char *)trans = 'T'; + } + if (tran) { + ma = k; + na = n; + } else { + ma = n; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L110; + } + laa = lda * na; + +/* Generate the matrix A. */ + + if (tran) { + i__3 = *nmax << 1; + zmake_("ge", " ", " ", &ma, &na, &ab[1], &i__3, &aa[1], & + lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen) + 1); + } else { + zmake_("ge", " ", " ", &ma, &na, &ab[1], nmax, &aa[1], & + lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen) + 1); + } + +/* Generate the matrix B. */ + + ldb = lda; + lbb = laa; + if (tran) { + i__3 = *nmax << 1; + zmake_("ge", " ", " ", &ma, &na, &ab[k + 1], &i__3, &bb[1] + , &ldb, &reset, &c_b1, (ftnlen)2, (ftnlen)1, ( + ftnlen)1); + } else { + zmake_("ge", " ", " ", &ma, &na, &ab[k * *nmax + 1], nmax, + &bb[1], &ldb, &reset, &c_b1, (ftnlen)2, (ftnlen) + 1, (ftnlen)1); + } + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + upper = *(unsigned char *)uplo == 'U'; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + if (isconj) { + rbeta = beta.r; + z__1.r = rbeta, z__1.i = 0.; + beta.r = z__1.r, beta.i = z__1.i; + } + null = n <= 0; + if (isconj) { + null = null ||( (k <= 0 || (alpha.r == 0. && + alpha.i == 0.)) && rbeta == 1.); + } + +/* Generate the matrix C. */ + + zmake_(sname + 7, uplo, " ", &n, &n, &c__[ + c_offset], nmax, &cc[1], &ldc, &reset, & + c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1); + + ++nc; + +/* Save every datum before calling the subroutine. */ + + *(unsigned char *)uplos = *(unsigned char *)uplo; + *(unsigned char *)transs = *(unsigned char *) + trans; + ns = n; + ks = k; + als.r = alpha.r, als.i = alpha.i; + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + i__5 = lbb; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7] + .i; +/* L20: */ + } + ldbs = ldb; + if (isconj) { + rbets = rbeta; + } else { + bets.r = beta.r, bets.i = beta.i; + } + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (isconj) { + if (*trace) { + zprcn7_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, &ldb, + &rbeta, &ldc, (ftnlen)12, ( + ftnlen)1, (ftnlen)1); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + czher2k_(iorder, uplo, trans, &n, &k, &alpha, + &aa[1], &lda, &bb[1], &ldb, &rbeta, & + cc[1], &ldc, (ftnlen)1, (ftnlen)1); + } else { + if (*trace) { + zprcn5_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, &ldb, + &beta, &ldc, (ftnlen)12, (ftnlen) + 1, (ftnlen)1); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + czsyr2k_(iorder, uplo, trans, &n, &k, &alpha, + &aa[1], &lda, &bb[1], &ldb, &beta, & + cc[1], &ldc, (ftnlen)1, (ftnlen)1); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); + *fatal = TRUE_; + goto L150; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[1] = *(unsigned char *)transs == *(unsigned + char *)trans; + isame[2] = ns == n; + isame[3] = ks == k; + isame[4] = als.r == alpha.r && als.i == alpha.i; + isame[5] = lze_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + isame[7] = lze_(&bs[1], &bb[1], &lbb); + isame[8] = ldbs == ldb; + if (isconj) { + isame[9] = rbets == rbeta; + } else { + isame[9] = bets.r == beta.r && bets.i == + beta.i; + } + if (null) { + isame[10] = lze_(&cs[1], &cc[1], &lcc); + } else { + isame[10] = lzeres_("he", uplo, &n, &n, &cs[1] + , &cc[1], &ldc, (ftnlen)2, (ftnlen)1); + } + isame[11] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L150; + } + + if (! null) { + +/* Check the result column by column. */ + + if (isconj) { + *(unsigned char *)transt = 'C'; + } else { + *(unsigned char *)transt = 'T'; + } + jjab = 1; + jc = 1; + i__5 = n; + for (j = 1; j <= i__5; ++j) { + if (upper) { + jj = 1; + lj = j; + } else { + jj = j; + lj = n - j + 1; + } + if (tran) { + i__6 = k; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = ((j - 1) << 1) * *nmax + k + + i__; + z__1.r = alpha.r * ab[i__8].r - + alpha.i * ab[i__8].i, + z__1.i = alpha.r * ab[ + i__8].i + alpha.i * ab[ + i__8].r; + w[i__7].r = z__1.r, w[i__7].i = + z__1.i; + if (isconj) { + i__7 = k + i__; + d_cnjg(&z__2, &alpha); + i__8 = ((j - 1) << 1) * *nmax + i__; + z__1.r = z__2.r * ab[i__8].r - z__2.i * ab[i__8].i, + z__1.i = z__2.r * ab[i__8].i + z__2.i * ab[ + i__8].r; + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + } else { + i__7 = k + i__; + i__8 = ((j - 1) << 1) * *nmax + i__; + z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, z__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + } +/* L50: */ + } + i__6 = k << 1; + i__7 = *nmax << 1; + i__8 = *nmax << 1; + zmmch_(transt, "N", &lj, &c__1, &i__6, + &c_b2, &ab[jjab], &i__7, &w[ + 1], &i__8, &beta, &c__[jj + j + * c_dim1], nmax, &ct[1], &g[1] + , &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true, (ftnlen) + 1, (ftnlen)1); + } else { + i__6 = k; + for (i__ = 1; i__ <= i__6; ++i__) { + if (isconj) { + i__7 = i__; + d_cnjg(&z__2, &ab[(k + i__ - 1) * *nmax + j]); + z__1.r = alpha.r * z__2.r - alpha.i * z__2.i, + z__1.i = alpha.r * z__2.i + alpha.i * + z__2.r; + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + i__7 = k + i__; + i__8 = (i__ - 1) * *nmax + j; + z__2.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, z__2.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + d_cnjg(&z__1, &z__2); + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + } else { + i__7 = i__; + i__8 = (k + i__ - 1) * *nmax + j; + z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, z__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + i__7 = k + i__; + i__8 = (i__ - 1) * *nmax + j; + z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, z__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + } +/* L60: */ + } + i__6 = k << 1; + i__7 = *nmax << 1; + zmmch_("N", "N", &lj, &c__1, &i__6, & + c_b2, &ab[jj], nmax, &w[1], & + i__7, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true, (ftnlen) + 1, (ftnlen)1); + } + if (upper) { + jc += ldc; + } else { + jc = jc + ldc + 1; + if (tran) { + jjab += *nmax << 1; + } + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L140; + } +/* L70: */ + } + } + +/* L80: */ + } + +/* L90: */ + } + +/* L100: */ + } + +L110: + ; + } + +/* L120: */ + } + +L130: + ; + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + if (*iorder == 1) { + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + } else { + if (*iorder == 0) { + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + if (*iorder == 1) { + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + } + goto L160; + +L140: + if (n > 1) { + printf(" THESE ARE THE RESULTS FOR COLUMN %d:\n",j); + } + +L150: + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + if (isconj) { + zprcn7_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + ldb, &rbeta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); + } else { + zprcn5_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + ldb, &beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); + } + +L160: + return 0; + +/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, */ +/* $ ', C,', I3, ') .' ) */ +/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */ +/* $ ',', F4.1, '), C,', I3, ') .' ) */ + +/* End of ZCHK5. */ + +} /* zchk5_ */ + + +/* Subroutine */ int zprcn5_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) +{ + /* Local variables */ + static char ca[14], cu[14], crc[14]; + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("%d %d (%4.1lf,%4.1lf) , A, %d, B, %d, (%4.1lf,%4.1lf) , C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); + +return 0; +} /* zprcn5_ */ + + + +/* Subroutine */ int zprcn7_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) +{ + + /* Local variables */ + static char ca[14], cu[14], crc[14]; + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("%d %d (%4.1lf,%4.1lf), A, %d, B, %d, %4.1lf, C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,*beta,*ldc); + +return 0; +} /* zprcn7_ */ + + +/* Subroutine */ int zmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublecomplex* a, integer* nmax, doublecomplex* aa, integer* lda, logical* reset, doublecomplex* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + doublereal d__1; + doublecomplex z__1, z__2; + + /* Local variables */ + static integer ibeg, iend; + extern /* Double Complex */ VOID zbeg_(doublecomplex*, logical*); + static logical unit; + static integer i__, j; + static logical lower, upper; + static integer jj; + static logical gen, her, tri, sym; + + +/* Generates values for an M by N matrix A. */ +/* Stores the values in the array AA in the data structure required */ +/* by the routine, with unwanted elements set to rogue value. */ + +/* TYPE is 'ge', 'he', 'sy' or 'tr'. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. External Functions .. */ +/* .. Intrinsic Functions .. */ +/* .. Executable Statements .. */ + /* Parameter adjustments */ + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --aa; + + /* Function Body */ + gen = s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0; + her = s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0; + sym = s_cmp(type__, "sy", (ftnlen)2, (ftnlen)2) == 0; + tri = s_cmp(type__, "tr", (ftnlen)2, (ftnlen)2) == 0; + upper = (her || sym || tri) && *(unsigned char *)uplo == 'U'; + lower = (her || sym || tri) && *(unsigned char *)uplo == 'L'; + unit = tri && *(unsigned char *)diag == 'U'; + +/* Generate data in array A. */ + + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + if (gen || (upper && i__ <= j) || (lower && i__ >= j)) { + i__3 = i__ + j * a_dim1; + zbeg_(&z__2, reset); + z__1.r = z__2.r + transl->r, z__1.i = z__2.i + transl->i; + a[i__3].r = z__1.r, a[i__3].i = z__1.i; + if (i__ != j) { +/* Set some elements to zero */ + if (*n > 3 && j == *n / 2) { + i__3 = i__ + j * a_dim1; + a[i__3].r = 0., a[i__3].i = 0.; + } + if (her) { + i__3 = j + i__ * a_dim1; + d_cnjg(&z__1, &a[i__ + j * a_dim1]); + a[i__3].r = z__1.r, a[i__3].i = z__1.i; + } else if (sym) { + i__3 = j + i__ * a_dim1; + i__4 = i__ + j * a_dim1; + a[i__3].r = a[i__4].r, a[i__3].i = a[i__4].i; + } else if (tri) { + i__3 = j + i__ * a_dim1; + a[i__3].r = 0., a[i__3].i = 0.; + } + } + } +/* L10: */ + } + if (her) { + i__2 = j + j * a_dim1; + i__3 = j + j * a_dim1; + d__1 = a[i__3].r; + z__1.r = d__1, z__1.i = 0.; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + } + if (tri) { + i__2 = j + j * a_dim1; + i__3 = j + j * a_dim1; + z__1.r = a[i__3].r + 1., z__1.i = a[i__3].i + 0.; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + } + if (unit) { + i__2 = j + j * a_dim1; + a[i__2].r = 1., a[i__2].i = 0.; + } +/* L20: */ + } + +/* Store elements in array AS in data structure required by routine. */ + + if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + i__4 = i__ + j * a_dim1; + aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i; +/* L30: */ + } + i__2 = *lda; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10, aa[i__3].i = 1e10; +/* L40: */ + } +/* L50: */ + } + } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, + "sy", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, "tr", (ftnlen) + 2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (upper) { + ibeg = 1; + if (unit) { + iend = j - 1; + } else { + iend = j; + } + } else { + if (unit) { + ibeg = j + 1; + } else { + ibeg = j; + } + iend = *n; + } + i__2 = ibeg - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10, aa[i__3].i = 1e10; +/* L60: */ + } + i__2 = iend; + for (i__ = ibeg; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + i__4 = i__ + j * a_dim1; + aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i; +/* L70: */ + } + i__2 = *lda; + for (i__ = iend + 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10, aa[i__3].i = 1e10; +/* L80: */ + } + if (her) { + jj = j + (j - 1) * *lda; + i__2 = jj; + i__3 = jj; + d__1 = aa[i__3].r; + z__1.r = d__1, z__1.i = -1e10; + aa[i__2].r = z__1.r, aa[i__2].i = z__1.i; + } +/* L90: */ + } + } + return 0; + +/* End of ZMAKE. */ + +} /* zmake_ */ + +/* Subroutine */ int zmmch_(char* transa, char* transb, integer* m, integer* n, integer* kk, doublecomplex* alpha, doublecomplex* a, integer* lda, doublecomplex* b, integer* ldb, doublecomplex* beta, doublecomplex* c__, integer* ldc, doublecomplex* ct, doublereal* g, doublecomplex* cc, integer* ldcc, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen transa_len, ftnlen transb_len) +{ + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, + cc_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; + doublereal d__1, d__2, d__3, d__4, d__5, d__6; + doublecomplex z__1, z__2, z__3, z__4; + + double sqrt(double); + /* Local variables */ + static doublereal erri; + static integer i__, j, k; + static logical trana, tranb, ctrana, ctranb; + +/* Checks the results of the computational tests. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Intrinsic Functions .. */ +/* .. Statement Functions .. */ +/* .. Statement Function definitions .. */ +/* .. Executable Statements .. */ + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --ct; + --g; + cc_dim1 = *ldcc; + cc_offset = 1 + cc_dim1 * 1; + cc -= cc_offset; + + /* Function Body */ + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == + 'C'; + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == + 'C'; + ctrana = *(unsigned char *)transa == 'C'; + ctranb = *(unsigned char *)transb == 'C'; + +/* Compute expected result, one column at a time, in CT using data */ +/* in A, B and C. */ +/* Compute gauges in G. */ + + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + ct[i__3].r = 0., ct[i__3].i = 0.; + g[i__] = 0.; +/* L10: */ + } + if (! trana && ! tranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + i__7 = k + j * b_dim1; + z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, + z__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[ + i__7].r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = i__ + k * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag( + &a[i__ + k * a_dim1]), abs(d__2))) * ((d__3 = b[ + i__5].r, abs(d__3)) + (d__4 = d_imag(&b[k + j * + b_dim1]), abs(d__4))); +/* L20: */ + } +/* L30: */ + } + } else if (trana && ! tranb) { + if (ctrana) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + d_cnjg(&z__3, &a[k + i__ * a_dim1]); + i__6 = k + j * b_dim1; + z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, + z__2.i = z__3.r * b[i__6].i + z__3.i * b[i__6] + .r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) * (( + d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( + &b[k + j * b_dim1]), abs(d__4))); +/* L40: */ + } +/* L50: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + i__7 = k + j * b_dim1; + z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] + .i, z__2.i = a[i__6].r * b[i__7].i + a[i__6] + .i * b[i__7].r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) * (( + d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( + &b[k + j * b_dim1]), abs(d__4))); +/* L60: */ + } +/* L70: */ + } + } + } else if (! trana && tranb) { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + d_cnjg(&z__3, &b[j + k * b_dim1]); + z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, + z__2.i = a[i__6].r * z__3.i + a[i__6].i * + z__3.r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = i__ + k * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[i__ + k * a_dim1]), abs(d__2))) * (( + d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( + &b[j + k * b_dim1]), abs(d__4))); +/* L80: */ + } +/* L90: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + i__7 = j + k * b_dim1; + z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] + .i, z__2.i = a[i__6].r * b[i__7].i + a[i__6] + .i * b[i__7].r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = i__ + k * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[i__ + k * a_dim1]), abs(d__2))) * (( + d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( + &b[j + k * b_dim1]), abs(d__4))); +/* L100: */ + } +/* L110: */ + } + } + } else if (trana && tranb) { + if (ctrana) { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + d_cnjg(&z__3, &a[k + i__ * a_dim1]); + d_cnjg(&z__4, &b[j + k * b_dim1]); + z__2.r = z__3.r * z__4.r - z__3.i * z__4.i, + z__2.i = z__3.r * z__4.i + z__3.i * + z__4.r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + = d_imag(&b[j + k * b_dim1]), abs(d__4))); +/* L120: */ + } +/* L130: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + d_cnjg(&z__3, &a[k + i__ * a_dim1]); + i__6 = j + k * b_dim1; + z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, + z__2.i = z__3.r * b[i__6].i + z__3.i * b[ + i__6].r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + = d_imag(&b[j + k * b_dim1]), abs(d__4))); +/* L140: */ + } +/* L150: */ + } + } + } else { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + d_cnjg(&z__3, &b[j + k * b_dim1]); + z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, + z__2.i = a[i__6].r * z__3.i + a[i__6].i * + z__3.r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + = d_imag(&b[j + k * b_dim1]), abs(d__4))); +/* L160: */ + } +/* L170: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + i__7 = j + k * b_dim1; + z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[ + i__7].i, z__2.i = a[i__6].r * b[i__7].i + + a[i__6].i * b[i__7].r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + = d_imag(&b[j + k * b_dim1]), abs(d__4))); +/* L180: */ + } +/* L190: */ + } + } + } + } + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__; + z__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, z__2.i = + alpha->r * ct[i__4].i + alpha->i * ct[i__4].r; + i__5 = i__ + j * c_dim1; + z__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, z__3.i = + beta->r * c__[i__5].i + beta->i * c__[i__5].r; + z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i; + ct[i__3].r = z__1.r, ct[i__3].i = z__1.i; + i__3 = i__ + j * c_dim1; + g[i__] = ((d__1 = alpha->r, abs(d__1)) + (d__2 = d_imag(alpha), + abs(d__2))) * g[i__] + ((d__3 = beta->r, abs(d__3)) + ( + d__4 = d_imag(beta), abs(d__4))) * ((d__5 = c__[i__3].r, + abs(d__5)) + (d__6 = d_imag(&c__[i__ + j * c_dim1]), abs( + d__6))); +/* L200: */ + } + +/* Compute the error ratio for this result. */ + + *err = 0.; + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__ + j * cc_dim1; + z__2.r = ct[i__3].r - cc[i__4].r, z__2.i = ct[i__3].i - cc[i__4] + .i; + z__1.r = z__2.r, z__1.i = z__2.i; + erri = ((d__1 = z__1.r, abs(d__1)) + (d__2 = d_imag(&z__1), abs( + d__2))) / *eps; + if (g[i__] != 0.) { + erri /= g[i__]; + } + *err = f2cmax(*err,erri); + if (*err * sqrt(*eps) >= 1.) { + goto L230; + } +/* L210: */ + } + +/* L220: */ + } + +/* If the loop completes, all results are at least half accurate. */ + goto L250; + +/* Report fatal error. */ + +L230: + *fatal = TRUE_; + printf(" ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HALF ACCURATE *******\n"); + printf(" EXPECTED RESULT COMPUTED RESULT\n"); + i__1 = *m; + for (i__ = 1; i__ <= i__1; ++i__) { + if (*mv) { + printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,ct[i__].r,ct[i__].i,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i); + } else { + printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i,ct[i__].r,ct[i__].i); + } +/* L240: */ + } + if (*n > 1) { + printf(" THESE ARE THE RESULTS FOR COLUMN %d\n",j); + } + +L250: + return 0; + + +/* End of ZMMCH. */ + +} /* zmmch_ */ + +logical lze_(doublecomplex* ri, doublecomplex* rj, integer* lr) +{ + /* System generated locals */ + integer i__1, i__2, i__3; + logical ret_val; + + /* Local variables */ + static integer i__; + + +/* Tests if two arrays are identical. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Executable Statements .. */ + /* Parameter adjustments */ + --rj; + --ri; + + /* Function Body */ + i__1 = *lr; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + i__3 = i__; + if (ri[i__2].r != rj[i__3].r || ri[i__2].i != rj[i__3].i) { + goto L20; + } +/* L10: */ + } + ret_val = TRUE_; + goto L30; +L20: + ret_val = FALSE_; +L30: + return ret_val; + +/* End of LZE. */ + +} /* lze_ */ + +logical lzeres_(char* type__, char* uplo, integer* m, integer* n, doublecomplex *aa, doublecomplex* as, integer* lda, ftnlen type_len, ftnlen uplo_len) +{ + /* System generated locals */ + integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4; + logical ret_val; + + /* Local variables */ + static integer ibeg, iend, i__, j; + static logical upper; + + +/* Tests if selected elements in two arrays are equal. */ + +/* TYPE is 'ge' or 'he' or 'sy'. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Executable Statements .. */ + /* Parameter adjustments */ + as_dim1 = *lda; + as_offset = 1 + as_dim1 * 1; + as -= as_offset; + aa_dim1 = *lda; + aa_offset = 1 + aa_dim1 * 1; + aa -= aa_offset; + + /* Function Body */ + upper = *(unsigned char *)uplo == 'U'; + if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *lda; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L10: */ + } +/* L20: */ + } + } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, + "sy", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (upper) { + ibeg = 1; + iend = j; + } else { + ibeg = j; + iend = *n; + } + i__2 = ibeg - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L30: */ + } + i__2 = *lda; + for (i__ = iend + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L40: */ + } +/* L50: */ + } + } + +/* 60 CONTINUE */ + ret_val = TRUE_; + goto L80; +L70: + ret_val = FALSE_; +L80: + return ret_val; + +/* End of LZERES. */ + +} /* lzeres_ */ + +/* Double Complex */ VOID zbeg_(doublecomplex* ret_val, logical* reset) +{ + /* System generated locals */ + doublereal d__1, d__2; + doublecomplex z__1; + + /* Local variables */ + static integer i__, j, ic, mi, mj; + + +/* Generates complex numbers as pairs of random numbers uniformly */ +/* distributed between -0.5 and 0.5. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Scalar Arguments .. */ +/* .. Local Scalars .. */ +/* .. Save statement .. */ +/* .. Intrinsic Functions .. */ +/* .. Executable Statements .. */ + if (*reset) { +/* Initialize local variables. */ + mi = 891; + mj = 457; + i__ = 7; + j = 7; + ic = 0; + *reset = FALSE_; + } + +/* The sequence of values of I or J is bounded between 1 and 999. */ +/* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. */ +/* If initial I or J = 4 or 8, the period will be 25. */ +/* If initial I or J = 5, the period will be 10. */ +/* IC is used to break up the period by skipping 1 value of I or J */ +/* in 6. */ + + ++ic; +L10: + i__ *= mi; + j *= mj; + i__ -= i__ / 1000 * 1000; + j -= j / 1000 * 1000; + if (ic >= 5) { + ic = 0; + goto L10; + } + d__1 = (i__ - 500) / 1001.; + d__2 = (j - 500) / 1001.; + z__1.r = d__1, z__1.i = d__2; + ret_val->r = z__1.r, ret_val->i = z__1.i; + return ; + +/* End of ZBEG. */ + +} /* zbeg_ */ + +doublereal ddiff_(doublereal* x, doublereal* y) +{ + /* System generated locals */ + doublereal ret_val; + + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Scalar Arguments .. */ +/* .. Executable Statements .. */ + ret_val = *x - *y; + return ret_val; + +/* End of DDIFF. */ + +} /* ddiff_ */ + +/* Main program alias */ /*int zblat3_ () { MAIN__ (); }*/ diff --git a/docs/distributing.md b/docs/distributing.md new file mode 100644 index 000000000..1e6372a28 --- /dev/null +++ b/docs/distributing.md @@ -0,0 +1,270 @@ +# Guidance for redistributing OpenBLAS + +*We note that this document contains recommendations only - packagers and other +redistributors are in charge of how OpenBLAS is built and distributed in their +systems, and may have good reasons to deviate from the guidance given on this +page. These recommendations are aimed at general packaging systems, with a user +base that typically is large, open source (or freely available at least), and +doesn't behave uniformly or that the packager is directly connected with.* + +OpenBLAS has a large number of build-time options which can be used to change +how it behaves at runtime, how artifacts or symbols are named, etc. Variation +in build configuration can be necessary to acheive a given end goal within a +distribution or as an end user. However, such variation can also make it more +difficult to build on top of OpenBLAS and ship code or other packages in a way +that works across many different distros. Here we provide guidance about the +most important build options, what effects they may have when changed, and +which ones to default to. + +The Make and CMake build systems provide equivalent options and yield more or +less the same artifacts, but not exactly (the CMake builds are still +experimental). You can choose either one and the options will function in the +same way, however the CMake outputs may require some renaming. To review +available build options, see `Makefile.rule` or `CMakeLists.txt` in the root of +the repository. + +Build options typically fall into two categories: (a) options that affect the +user interface, such as library and symbol names or APIs that are made +available, and (b) options that affect performance and runtime behavior, such +as threading behavior or CPU architecture-specific code paths. The user +interface options are more important to keep aligned between distributions, +while for the performance-related options there are typically more reasons to +make choices that deviate from the defaults. + +Here are recommendations for user interface related packaging choices where it +is not likely to be a good idea to deviate (typically these are the default +settings): + +1. Include CBLAS. The CBLAS interface is widely used and it doesn't affect + binary size much, so don't turn it off. +2. Include LAPACK and LAPACKE. The LAPACK interface is also widely used, and + while it does make up a significant part of the binary size of the installed + library, that does not outweigh the regression in usability when deviating + from the default here.[^1] +3. Always distribute the pkg-config (`.pc`) and CMake `.cmake`) dependency + detection files. These files are used by build systems when users want to + link against OpenBLAS, and there is no benefit of leaving them out. +4. Provide the LP64 interface by default, and if in addition to that you choose + to provide an ILP64 interface build as well, use a symbol suffix to avoid + symbol name clashes (see the next section). + +[^1] All major distributions do include LAPACK as of mid 2023 as far as we +know. Older versions of Arch Linux did not, and that was known to cause +problems. + + +## ILP64 interface builds + +The LP64 (32-bit integer) interface is the default build, and has +well-established C and Fortran APIs as determined by the reference (Netlib) +BLAS and LAPACK libraries. The ILP64 (64-bit integer) interface however does +not have a standard API: symbol names and shared/static library names can be +produced in multiple ways, and this tends to make it difficult to use. +As of today there is an agreed-upon way of choosing names for OpenBLAS between +a number of key users/redistributors, which is the closest thing to a standard +that there is now. However, there is an ongoing standardization effort in the +reference BLAS and LAPACK libraries, which differs from the current OpenBLAS +agreed-upon convention. In this section we'll aim to explain both. + +Those two methods are fairly similar, and have a key thing in common: *using a +symbol suffix*. This is good practice; it is recommended that if you distribute +an ILP64 build, to have it use a symbol suffix containing `64` in the name. +This avoids potential symbol clashes when different packages which depend on +OpenBLAS load both an LP64 and an ILP64 library into memory at the same time. + +### The current OpenBLAS agreed-upon ILP64 convention + +This convention comprises the shared library name and the symbol suffix in the +shared library. The symbol suffix to use is `64_`, implying that the library +name will be `libopenblas64_.so` and the symbols in that library end in `64_`. +The central issue where this was discussed is +[openblas#646](https://github.com/xianyi/OpenBLAS/issues/646), and adopters +include Fedora, Julia, NumPy and SciPy - SuiteSparse already used it as well. + +To build shared and static libraries with the currently recommended ILP64 +conventions with Make: +```bash +$ make INTERFACE64=1 SYMBOLSUFFIX=64_ +``` + +This will produce libraries named `libopenblas64_.so|a`, a pkg-config file +named `openblas64.pc`, and CMake and header files. + +Installing locally and inspecting the output will show a few more details: +```bash +$ make install PREFIX=$PWD/../openblas/make64 INTERFACE64=1 SYMBOLSUFFIX=64_ +$ tree . # output slightly edited down +. +├── include +│   ├── cblas.h +│   ├── f77blas.h +│   ├── lapacke_config.h +│   ├── lapacke.h +│   ├── lapacke_mangling.h +│   ├── lapacke_utils.h +│   ├── lapack.h +│   └── openblas_config.h +└── lib + ├── cmake + │   └── openblas + │   ├── OpenBLASConfig.cmake + │   └── OpenBLASConfigVersion.cmake + ├── libopenblas64_.a + ├── libopenblas64_.so + └── pkgconfig + └── openblas64.pc +``` + +A key point are the symbol names. These will equal the LP64 symbol names, then +(for Fortran only) the compiler mangling, and then the `64_` symbol suffix. +Hence to obtain the final symbol names, we need to take into account which +Fortran compiler we are using. For the most common cases (e.g., gfortran, Intel +Fortran, or Flang), that means appending a single underscore. In that case, the +result is: + +| base API name | binary symbol name | call from Fortran code | call from C code | +|---------------|--------------------|------------------------|-----------------------| +| `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` | +| `cblas_dgemm` | `cblas_dgemm64_` | n/a | `cblas_dgemm64_(...)` | + +It is quite useful to have these symbol names be as uniform as possible across +different packaging systems. + +The equivalent build options with CMake are: +```bash +$ mkdir build && cd build +$ cmake .. -DINTERFACE64=1 -DSYMBOLSUFFIX=64_ -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON +$ cmake --build . -j +``` + +Note that the result is not 100% identical to the Make result. For example, the +library name ends in `_64` rather than `64_` - it is recommended to rename them +to match the Make library names (also update the `libsuffix` entry in +`openblas64.pc` to match that rename). +```bash +$ cmake --install . --prefix $PWD/../../openblas/cmake64 +$ tree . +. +├── include +│   └── openblas64 +│   ├── cblas.h +│   ├── f77blas.h +│   ├── lapacke_config.h +│   ├── lapacke_example_aux.h +│   ├── lapacke.h +│   ├── lapacke_mangling.h +│   ├── lapacke_utils.h +│   ├── lapack.h +│   ├── openblas64 +│   │   └── lapacke_mangling.h +│   └── openblas_config.h +└── lib + ├── cmake + │   └── OpenBLAS64 + │   ├── OpenBLAS64Config.cmake + │   ├── OpenBLAS64ConfigVersion.cmake + │   ├── OpenBLAS64Targets.cmake + │   └── OpenBLAS64Targets-noconfig.cmake + ├── libopenblas_64.a + ├── libopenblas_64.so -> libopenblas_64.so.0 + └── pkgconfig + └── openblas64.pc +``` + + +### The upcoming standardized ILP64 convention + +While the `64_` convention above got some adoption, it's slightly hacky and is +implemented through the use of `objcopy`. An effort is ongoing for a more +broadly adopted convention in the reference BLAS and LAPACK libraries, using +(a) the `_64` suffix, and (b) applying that suffix _before_ rather than after +Fortran compiler mangling. The central issue for this is +[lapack#666](https://github.com/Reference-LAPACK/lapack/issues/666). + +For the most common cases of compiler mangling (a single `_` appended), the end +result will be: + +| base API name | binary symbol name | call from Fortran code | call from C code | +|---------------|--------------------|------------------------|-----------------------| +| `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` | +| `cblas_dgemm` | `cblas_dgemm_64` | n/a | `cblas_dgemm_64(...)` | + +For other compiler mangling schemes, replace the trailing `_` by the scheme in use. + +The shared library name for this `_64` convention should be `libopenblas_64.so`. + +Note: it is not yet possible to produce an OpenBLAS build which employs this +convention! Once reference BLAS and LAPACK with support for `_64` have been +released, a future OpenBLAS release will support it. For now, please use the +older `64_` scheme and avoid using the name `libopenblas_64.so`; it should be +considered reserved for future use of the `_64` standard as prescribed by +reference BLAS/LAPACK. + + +## Performance and runtime behavior related build options + +For these options there are multiple reasonable or common choices. + +### Threading related options + +OpenBLAS can be built as a multi-threaded or single-threaded library, with the +default being multi-threaded. It's expected that the default `libopenblas` +library is multi-threaded; if you'd like to also distribute single-threaded +builds, consider naming them `libopenblas_sequential`. + +OpenBLAS can be built with pthreads or OpenMP as the threading model, with the +default being pthreads. Both options are commonly used, and the choice here +should not influence the shared library name. The choice will be captured by +the `.pc` file. E.g.,: +```bash +$ pkg-config --libs openblas +-fopenmp -lopenblas + +$ cat openblas.pc +... +openblas_config= ... USE_OPENMP=0 MAX_THREADS=24 +``` + +The maximum number of threads users will be able to use is determined at build +time by the `NUM_THREADS` build option. It defaults to 24, and there's a wide +range of values that are reasonable to use (up to 256). 64 is a typical choice +here; there is a memory footprint penalty that is linear in `NUM_THREADS`. +Please see `Makefile.rule` for more details. + +### CPU architecture related options + +OpenBLAS contains a lot of CPU architecture-specific optimizations, hence when +distributing to a user base with a variety of hardware, it is recommended to +enable CPU architecture runtime detection. This will dynamically select +optimized kernels for individual APIs. To do this, use the `DYNAMIC_ARCH=1` +build option. This is usually done on all common CPU families, except when +there are known issues. + +In case the CPU architecture is known (e.g. you're building binaries for macOS +M1 users), it is possible to specify the target architecture directly with the +`TARGET=` build option. + +`DYNAMIC_ARCH` and `TARGET` are covered in more detail in the main `README.md` +in this repository. + + +## Real-world examples + +OpenBLAS is likely to be distributed in one of these distribution models: + +1. As a standalone package, or multiple packages, in a packaging ecosystem like + a Linux distro, Homebrew, conda-forge or MSYS2. +2. Vendored as part of a larger package, e.g. in Julia, NumPy, SciPy, or R. +3. Locally, e.g. making available as a build on a single HPC cluster. + +The guidance on this page is most important for models (1) and (2). These links +to build recipes for a representative selection of packaging systems may be +helpful as a reference: + +- [Fedora](https://src.fedoraproject.org/rpms/openblas/blob/rawhide/f/openblas.spec) +- [Debian](https://salsa.debian.org/science-team/openblas/-/blob/master/debian/rules) +- [Homebrew](https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/openblas.rb) +- [MSYS2](https://github.com/msys2/MINGW-packages/blob/master/mingw-w64-openblas/PKGBUILD) +- [conda-forge](https://github.com/conda-forge/openblas-feedstock/blob/main/recipe/build.sh) +- [NumPy/SciPy](https://github.com/MacPython/openblas-libs/blob/main/tools/build_openblas.sh) +- [Nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/development/libraries/science/math/openblas/default.nix) diff --git a/driver/level3/gemm_thread_m.c b/driver/level3/gemm_thread_m.c index 8813e5529..353ae0be9 100644 --- a/driver/level3/gemm_thread_m.c +++ b/driver/level3/gemm_thread_m.c @@ -40,7 +40,7 @@ #include #include "common.h" -int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) { +int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG ), void *sa, void *sb, BLASLONG nthreads) { blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range[MAX_CPU_NUMBER + 1]; diff --git a/driver/level3/gemm_thread_mn.c b/driver/level3/gemm_thread_mn.c index 6b52df884..4f370999a 100644 --- a/driver/level3/gemm_thread_mn.c +++ b/driver/level3/gemm_thread_mn.c @@ -60,7 +60,7 @@ static const int divide_rule[][2] = { 1, 61}, { 2, 31}, { 7, 9}, { 8, 8}, }; -int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) { +int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG ), void *sa, void *sb, BLASLONG nthreads) { blas_queue_t queue[MAX_CPU_NUMBER]; diff --git a/driver/level3/gemm_thread_n.c b/driver/level3/gemm_thread_n.c index 9668841bb..d583456bd 100644 --- a/driver/level3/gemm_thread_n.c +++ b/driver/level3/gemm_thread_n.c @@ -40,7 +40,7 @@ #include #include "common.h" -int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) { +int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG), void *sa, void *sb, BLASLONG nthreads) { blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range[MAX_CPU_NUMBER + 1]; diff --git a/driver/level3/gemm_thread_variable.c b/driver/level3/gemm_thread_variable.c index 162a75f70..75e49cb1a 100644 --- a/driver/level3/gemm_thread_variable.c +++ b/driver/level3/gemm_thread_variable.c @@ -42,7 +42,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, - int (*function)(), void *sa, void *sb, BLASLONG divM, BLASLONG divN) { + int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG ), void *sa, void *sb, BLASLONG divM, BLASLONG divN) { blas_queue_t queue[MAX_CPU_NUMBER]; diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c index 39824fc5a..26d07fa94 100644 --- a/driver/level3/level3_gemm3m_thread.c +++ b/driver/level3/level3_gemm3m_thread.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -44,10 +45,6 @@ #define DIVIDE_RATE 2 #endif -#ifndef SWITCH_RATIO -#define SWITCH_RATIO 2 -#endif - //The array of job_t may overflow the stack. //Instead, use malloc to alloc job_t. #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD @@ -1015,6 +1012,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO BLASLONG divN, divT; int mode; +#if defined(DYNAMIC_ARCH) + int switch_ratio = gotoblas->switch_ratio; +#else + int switch_ratio = SWITCH_RATIO; +#endif + if (range_m) { BLASLONG m_from = *(((BLASLONG *)range_m) + 0); BLASLONG m_to = *(((BLASLONG *)range_m) + 1); @@ -1030,7 +1033,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO } */ - if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) { + if ((args -> m < nthreads * switch_ratio) || (args -> n < nthreads * switch_ratio)) { GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0); return 0; } @@ -1038,7 +1041,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO divT = nthreads; divN = 1; - while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) { + while ((GEMM3M_P * divT > m * switch_ratio) && (divT > 1)) { do { divT --; divN = 1; diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c index d7dcd68a3..b03577fb3 100644 --- a/driver/level3/level3_syrk_threaded.c +++ b/driver/level3/level3_syrk_threaded.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -44,10 +45,6 @@ #define DIVIDE_RATE 2 #endif -#ifndef SWITCH_RATIO -#define SWITCH_RATIO 2 -#endif - //The array of job_t may overflow the stack. //Instead, use malloc to alloc job_t. #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD @@ -528,7 +525,13 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO int mode, mask; double dnum, di, dinum; - if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) { +#if defined(DYNAMIC_ARCH) + int switch_ratio = gotoblas->switch_ratio; +#else + int switch_ratio = SWITCH_RATIO; +#endif + + if ((nthreads == 1) || (args->n < nthreads * switch_ratio)) { SYRK_LOCAL(args, range_m, range_n, sa, sb, 0); return 0; } diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index ff32a74a9..c7ccf8426 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -44,10 +45,6 @@ #define DIVIDE_RATE 2 #endif -#ifndef SWITCH_RATIO -#define SWITCH_RATIO 2 -#endif - #ifndef GEMM_PREFERED_SIZE #define GEMM_PREFERED_SIZE 1 #endif @@ -595,6 +592,11 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG BLASLONG width, i, j, k, js; BLASLONG m, n, n_from, n_to; int mode; +#if defined(DYNAMIC_ARCH) + int switch_ratio = gotoblas->switch_ratio; +#else + int switch_ratio = SWITCH_RATIO; +#endif /* Get execution mode */ #ifndef COMPLEX @@ -732,8 +734,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG num_parts = 0; while (n > 0){ width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts); - if (width < SWITCH_RATIO) { - width = SWITCH_RATIO; + if (width < switch_ratio) { + width = switch_ratio; } width = round_up(n, width, GEMM_PREFERED_SIZE); @@ -792,6 +794,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF BLASLONG m = args -> m; BLASLONG n = args -> n; BLASLONG nthreads_m, nthreads_n; +#if defined(DYNAMIC_ARCH) + int switch_ratio = gotoblas->switch_ratio; +#else + int switch_ratio = SWITCH_RATIO; +#endif /* Get dimensions from index ranges if available */ if (range_m) { @@ -801,21 +808,21 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF n = range_n[1] - range_n[0]; } - /* Partitions in m should have at least SWITCH_RATIO rows */ - if (m < 2 * SWITCH_RATIO) { + /* Partitions in m should have at least switch_ratio rows */ + if (m < 2 * switch_ratio) { nthreads_m = 1; } else { nthreads_m = args -> nthreads; - while (m < nthreads_m * SWITCH_RATIO) { + while (m < nthreads_m * switch_ratio) { nthreads_m = nthreads_m / 2; } } - /* Partitions in n should have at most SWITCH_RATIO * nthreads_m columns */ - if (n < SWITCH_RATIO * nthreads_m) { + /* Partitions in n should have at most switch_ratio * nthreads_m columns */ + if (n < switch_ratio * nthreads_m) { nthreads_n = 1; } else { - nthreads_n = (n + SWITCH_RATIO * nthreads_m - 1) / (SWITCH_RATIO * nthreads_m); + nthreads_n = (n + switch_ratio * nthreads_m - 1) / (switch_ratio * nthreads_m); if (nthreads_m * nthreads_n > args -> nthreads) { nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m); } diff --git a/driver/level3/syrk_thread.c b/driver/level3/syrk_thread.c index 12808afd5..a40122e38 100644 --- a/driver/level3/syrk_thread.c +++ b/driver/level3/syrk_thread.c @@ -41,7 +41,7 @@ #include #include "common.h" -int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) { +int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*, FLOAT *, FLOAT *, BLASLONG), void *sa, void *sb, BLASLONG nthreads) { blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range[MAX_CPU_NUMBER + 1]; diff --git a/driver/others/blas_l1_thread.c b/driver/others/blas_l1_thread.c index 06039c952..01b254f5d 100644 --- a/driver/others/blas_l1_thread.c +++ b/driver/others/blas_l1_thread.c @@ -43,7 +43,7 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, - void *c, BLASLONG ldc, int (*function)(), int nthreads){ + void *c, BLASLONG ldc, int (*function)(void), int nthreads){ blas_queue_t queue[MAX_CPU_NUMBER]; blas_arg_t args [MAX_CPU_NUMBER]; @@ -141,7 +141,7 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, - void *c, BLASLONG ldc, int (*function)(), int nthreads){ + void *c, BLASLONG ldc, int (*function)(void), int nthreads){ blas_queue_t queue[MAX_CPU_NUMBER]; blas_arg_t args [MAX_CPU_NUMBER]; diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 051513f27..2531c57e9 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -93,7 +93,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -extern unsigned int openblas_thread_timeout(); +extern unsigned int openblas_thread_timeout(void); #ifdef SMP_SERVER @@ -113,6 +113,8 @@ extern unsigned int openblas_thread_timeout(); /* We need this global for checking if initialization is finished. */ int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; +int blas_omp_threads_local = 1; + /* Local Variables */ #if defined(USE_PTHREAD_LOCK) static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER; @@ -973,7 +975,7 @@ void goto_set_num_threads(int num_threads) { increased_threads = 1; - for(i = blas_num_threads - 1; i < num_threads - 1; i++){ + for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0); thread_status[i].status = THREAD_STATUS_WAKEUP; diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index bcd9c29b5..23110f807 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -68,8 +68,10 @@ #endif int blas_server_avail = 0; +int blas_omp_number_max = 0; +int blas_omp_threads_local = 1; -extern int openblas_omp_adaptive_env(); +extern int openblas_omp_adaptive_env(void); static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; #ifdef HAVE_C11 @@ -78,7 +80,7 @@ static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; #endif -static void adjust_thread_buffers() { +static void adjust_thread_buffers(void) { int i=0, j=0; @@ -100,8 +102,6 @@ static void adjust_thread_buffers() { void goto_set_num_threads(int num_threads) { - blas_num_threads_set = 1; - if (num_threads < 0) blas_num_threads_set = 0; if (num_threads < 1) num_threads = blas_num_threads; if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; @@ -126,6 +126,17 @@ void openblas_set_num_threads(int num_threads) { int blas_thread_init(void){ +#if defined(__FreeBSD__) && defined(__clang__) +extern int openblas_omp_num_threads_env(void); + + if(blas_omp_number_max <= 0) + blas_omp_number_max= openblas_omp_num_threads_env(); + if (blas_omp_number_max <= 0) + blas_omp_number_max=MAX_CPU_NUMBER; +#else + blas_omp_number_max = omp_get_max_threads(); +#endif + blas_get_cpu_number(); adjust_thread_buffers(); diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index afa33cccc..2ad8b8c5f 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -48,34 +48,38 @@ #endif #endif +#ifdef SMP_DEBUG +# define MT_TRACE(...) fprintf(stderr, __VA_ARGS__) +#else +# define MT_TRACE(...) +#endif + /* This is a thread implementation for Win32 lazy implementation */ /* Thread server common information */ -typedef struct{ - CRITICAL_SECTION lock; - HANDLE filled; - HANDLE killed; - - blas_queue_t *queue; /* Parameter Pointer */ - int shutdown; /* server shutdown flag */ -} blas_pool_t; +static blas_queue_t *work_queue = NULL; +static HANDLE kickoff_event = NULL; +static CRITICAL_SECTION queue_lock; /* We need this global for checking if initialization is finished. */ int blas_server_avail = 0; +int blas_omp_threads_local = 1; + /* Local Variables */ static BLASULONG server_lock = 0; -static blas_pool_t pool; static HANDLE blas_threads [MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER]; +static volatile int thread_target; // target num of live threads, volatile for cross-thread reads +// +// Legacy code path +// +static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) { - -static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ - - if (!(mode & BLAS_COMPLEX)){ + if (!(mode & BLAS_COMPLEX)) { #ifdef EXPRECISION if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ /* REAL / Extended Double */ @@ -90,7 +94,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else #endif - if ((mode & BLAS_PREC) == BLAS_DOUBLE){ + if ((mode & BLAS_PREC) == BLAS_DOUBLE) { /* REAL / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, @@ -101,7 +105,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else if ((mode & BLAS_PREC) == BLAS_SINGLE){ + } else if ((mode & BLAS_PREC) == BLAS_SINGLE) { /* REAL / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, @@ -113,7 +117,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> b, args -> ldb, args -> c, args -> ldc, sb); #ifdef BUILD_BFLOAT16 - } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ + } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16) { /* REAL / BFLOAT16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, @@ -124,7 +128,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else if ((mode & BLAS_PREC) == BLAS_STOBF16){ + } else if ((mode & BLAS_PREC) == BLAS_STOBF16) { /* REAL / BLAS_STOBF16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, bfloat16 *, BLASLONG, @@ -135,7 +139,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else if ((mode & BLAS_PREC) == BLAS_DTOBF16){ + } else if ((mode & BLAS_PREC) == BLAS_DTOBF16) { /* REAL / BLAS_DTOBF16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, bfloat16 *, BLASLONG, @@ -152,7 +156,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ } } else { #ifdef EXPRECISION - if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ + if ((mode & BLAS_PREC) == BLAS_XDOUBLE) { /* COMPLEX / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, @@ -166,7 +170,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else #endif - if ((mode & BLAS_PREC) == BLAS_DOUBLE){ + if ((mode & BLAS_PREC) == BLAS_DOUBLE) { /* COMPLEX / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, @@ -196,88 +200,78 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ } } -/* This is a main routine of threads. Each thread waits until job is */ -/* queued. */ - -static DWORD WINAPI blas_thread_server(void *arg){ +// +// This is a main routine of threads. Each thread waits until job is queued. +// +static DWORD WINAPI blas_thread_server(void *arg) { /* Thread identifier */ -#ifdef SMP_DEBUG BLASLONG cpu = (BLASLONG)arg; -#endif void *buffer, *sa, *sb; blas_queue_t *queue; - DWORD action; - HANDLE handles[] = {pool.filled, pool.killed}; /* Each server needs each buffer */ buffer = blas_memory_alloc(2); -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu); -#endif + MT_TRACE("Server[%2ld] Thread is started!\n", cpu); - while (1){ + while (1) { /* Waiting for Queue */ -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu); -#endif + MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu); - do { - action = WaitForMultipleObjects(2, handles, FALSE, INFINITE); - } while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1)); + // event raised when work is added to the queue + WaitForSingleObject(kickoff_event, INFINITE); - if (action == WAIT_OBJECT_0 + 1) break; + if (cpu > thread_target - 2) { + //MT_TRACE("thread [%d] exiting.\n", cpu); + break; // excess thread, so worker thread exits + } -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Got it.\n", cpu); -#endif + MT_TRACE("Server[%2ld] Got it.\n", cpu); - EnterCriticalSection(&pool.lock); + EnterCriticalSection(&queue_lock); - queue = pool.queue; - if (queue) pool.queue = queue->next; + queue = work_queue; + if (queue) + work_queue = work_queue->next; - LeaveCriticalSection(&pool.lock); + LeaveCriticalSection(&queue_lock); - if (queue) { + if (queue) { int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; - if (pool.queue) SetEvent(pool.filled); - sa = queue -> sa; sb = queue -> sb; -#ifdef CONSISTENT_FPCSR - __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); - __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); -#endif + #ifdef CONSISTENT_FPCSR + __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); + __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); + #endif -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", + MT_TRACE("Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); -#endif // fprintf(stderr, "queue start[%ld]!!!\n", cpu); -#ifdef MONITOR - main_status[cpu] = MAIN_RUNNING1; -#endif + #ifdef MONITOR + main_status[cpu] = MAIN_RUNNING1; + #endif - if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); + if (sa == NULL) + sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); if (sb == NULL) { - if (!(queue -> mode & BLAS_COMPLEX)){ + if (!(queue -> mode & BLAS_COMPLEX)) { #ifdef EXPRECISION - if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE) { sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif - if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) { #ifdef BUILD_DOUBLE sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); @@ -311,70 +305,59 @@ static DWORD WINAPI blas_thread_server(void *arg){ /* Other types in future */ } } - queue->sb=sb; + queue->sb=sb; } -#ifdef MONITOR - main_status[cpu] = MAIN_RUNNING2; -#endif + #ifdef MONITOR + main_status[cpu] = MAIN_RUNNING2; + #endif if (!(queue -> mode & BLAS_LEGACY)) { - - (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); + (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); } else { - legacy_exec(routine, queue -> mode, queue -> args, sb); + legacy_exec(routine, queue -> mode, queue -> args, sb); } - }else{ - continue; //if queue == NULL - } - -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); -#endif - - EnterCriticalSection(&queue->lock); - - queue -> status = BLAS_STATUS_FINISHED; - - LeaveCriticalSection(&queue->lock); + } else { + continue; //if queue == NULL + } - SetEvent(queue->finish); + MT_TRACE("Server[%2ld] Finished!\n", cpu); + + queue->finished = 1; } /* Shutdown procedure */ -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu); -#endif + MT_TRACE("Server[%2ld] Shutdown!\n", cpu); blas_memory_free(buffer); return 0; - } +} -/* Initializing routine */ -int blas_thread_init(void){ +// +// Initializing routine +// +int blas_thread_init(void) { BLASLONG i; if (blas_server_avail || (blas_cpu_number <= 1)) return 0; LOCK_COMMAND(&server_lock); -#ifdef SMP_DEBUG - fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n", - blas_cpu_number); -#endif + MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number); - if (!blas_server_avail){ + if (!blas_server_avail) { + // create the kickoff Event + kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); - InitializeCriticalSection(&pool.lock); - pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL); - pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL); + thread_target = blas_cpu_number; - pool.shutdown = 0; - pool.queue = NULL; + InitializeCriticalSection(&queue_lock); + + for(i = 0; i < blas_cpu_number - 1; i++) { + //MT_TRACE("thread_init: creating thread [%d]\n", i); - for(i = 0; i < blas_cpu_number - 1; i++){ blas_threads[i] = CreateThread(NULL, 0, blas_thread_server, (void *)i, 0, &blas_threads_id[i]); @@ -388,15 +371,12 @@ int blas_thread_init(void){ return 0; } -/* - User can call one of two routines. - - exec_blas_async ... immediately returns after jobs are queued. - - exec_blas ... returns after jobs are finished. -*/ - -int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ +// +// User can call one of two routines. +// exec_blas_async ... immediately returns after jobs are queued. +// exec_blas ... returns after jobs are finished. +// +int exec_blas_async(BLASLONG pos, blas_queue_t *queue) { #if defined(SMP_SERVER) // Handle lazy re-init of the thread-pool after a POSIX fork @@ -409,8 +389,6 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ current = queue; while (current) { - InitializeCriticalSection(¤t -> lock); - current -> finish = CreateEvent(NULL, FALSE, FALSE, NULL); current -> position = pos; #ifdef CONSISTENT_FPCSR @@ -418,56 +396,71 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode)); #endif + current->finished = 0; current = current -> next; pos ++; } - EnterCriticalSection(&pool.lock); + EnterCriticalSection(&queue_lock); - if (pool.queue) { - current = pool.queue; - while (current -> next) current = current -> next; - current -> next = queue; - } else { - pool.queue = queue; + if (!work_queue) + { + work_queue = queue; + } + else + { + blas_queue_t *next_item = work_queue; + + // find the end of the work queue + while (next_item) + next_item = next_item->next; + + // add new work to the end + next_item = queue; } - LeaveCriticalSection(&pool.lock); + LeaveCriticalSection(&queue_lock); - SetEvent(pool.filled); + SetEvent(kickoff_event); return 0; } -int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ +// +// Join. Wait for all queued tasks to complete +// +int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) { -#ifdef SMP_DEBUG - fprintf(STDERR, "Synchronization Waiting.\n"); -#endif + MT_TRACE("Synchronization Waiting.\n"); - while (num){ -#ifdef SMP_DEBUG - fprintf(STDERR, "Waiting Queue ..\n"); -#endif + while (num) { + MT_TRACE("Waiting Queue ..\n"); + + while (!queue->finished) + YIELDING; - WaitForSingleObject(queue->finish, INFINITE); + queue = queue->next; + num--; + } - CloseHandle(queue->finish); - DeleteCriticalSection(&queue -> lock); + MT_TRACE("Completely Done.\n\n"); - queue = queue -> next; - num --; - } + // if work was added to the queue after this batch we can't sleep the worker threads + // by resetting the event + EnterCriticalSection(&queue_lock); -#ifdef SMP_DEBUG - fprintf(STDERR, "Completely Done.\n\n"); -#endif + if (work_queue == NULL) + ResetEvent(kickoff_event); - return 0; + LeaveCriticalSection(&queue_lock); + + return 0; } -/* Execute Threads */ -int exec_blas(BLASLONG num, blas_queue_t *queue){ +// +// Execute Threads +// +int exec_blas(BLASLONG num, blas_queue_t *queue) { #if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) // Handle lazy re-init of the thread-pool after a POSIX fork @@ -480,29 +473,33 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ if ((num <= 0) || (queue == NULL)) return 0; - if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next); + if ((num > 1) && queue -> next) + exec_blas_async(1, queue -> next); routine = queue -> routine; if (queue -> mode & BLAS_LEGACY) { legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); - } else + } else { if (queue -> mode & BLAS_PTHREAD) { void (*pthreadcompat)(void *) = queue -> routine; (pthreadcompat)(queue -> args); } else (routine)(queue -> args, queue -> range_m, queue -> range_n, - queue -> sa, queue -> sb, 0); + queue -> sa, queue -> sb, 0); + } - if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); + if ((num > 1) && queue -> next) + exec_blas_async_wait(num - 1, queue -> next); return 0; } -/* Shutdown procedure, but user don't have to call this routine. The */ -/* kernel automatically kill threads. */ - -int BLASFUNC(blas_thread_shutdown)(void){ +// +// Shutdown procedure, but user don't have to call this routine. The +// kernel automatically kill threads. +// +int BLASFUNC(blas_thread_shutdown)(void) { int i; @@ -510,11 +507,9 @@ int BLASFUNC(blas_thread_shutdown)(void){ LOCK_COMMAND(&server_lock); - if (blas_server_avail){ - - SetEvent(pool.killed); + if (blas_server_avail) { - for(i = 0; i < blas_num_threads - 1; i++){ + for (i = 0; i < blas_num_threads - 1; i++) { // Could also just use WaitForMultipleObjects DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50); @@ -528,9 +523,6 @@ int BLASFUNC(blas_thread_shutdown)(void){ CloseHandle(blas_threads[i]); } - CloseHandle(pool.filled); - CloseHandle(pool.killed); - blas_server_avail = 0; } @@ -539,6 +531,9 @@ int BLASFUNC(blas_thread_shutdown)(void){ return 0; } +// +// Legacy function to set numbef of threads +// void goto_set_num_threads(int num_threads) { long i; @@ -552,23 +547,48 @@ void goto_set_num_threads(int num_threads) if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; + if (blas_server_avail && num_threads < blas_num_threads) { + LOCK_COMMAND(&server_lock); + + thread_target = num_threads; + + SetEvent(kickoff_event); + + for (i = num_threads - 1; i < blas_num_threads - 1; i++) { + //MT_TRACE("set_num_threads: waiting on thread [%d] to quit.\n", i); + + WaitForSingleObject(blas_threads[i], INFINITE); + + //MT_TRACE("set_num_threads: thread [%d] has quit.\n", i); + + CloseHandle(blas_threads[i]); + } + + blas_num_threads = num_threads; + + ResetEvent(kickoff_event); + + UNLOCK_COMMAND(&server_lock); + } + if (num_threads > blas_num_threads) { LOCK_COMMAND(&server_lock); - //increased_threads = 1; - if (!blas_server_avail){ + thread_target = num_threads; + + //increased_threads = 1; + if (!blas_server_avail) { + // create the kickoff Event + kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); - InitializeCriticalSection(&pool.lock); - pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL); - pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL); + InitializeCriticalSection(&queue_lock); - pool.shutdown = 0; - pool.queue = NULL; blas_server_avail = 1; } - for(i = blas_num_threads - 1; i < num_threads - 1; i++){ + for (i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) { + //MT_TRACE("set_num_threads: creating thread [%d]\n", i); blas_threads[i] = CreateThread(NULL, 0, blas_thread_server, (void *)i, @@ -583,6 +603,9 @@ void goto_set_num_threads(int num_threads) blas_cpu_number = num_threads; } +// +// Openblas function to set thread count +// void openblas_set_num_threads(int num) { goto_set_num_threads(num); diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index f61930983..e3f905265 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -220,6 +220,19 @@ extern gotoblas_t gotoblas_COOPERLAKE; #else #define gotoblas_COOPERLAKE gotoblas_PRESCOTT #endif +#ifdef DYN_SAPPHIRERAPIDS +extern gotoblas_t gotoblas_SAPPHIRERAPIDS; +#elif defined(DYN_SKYLAKEX) +#define gotoblas_SAPPHIRERAPIDS gotoblas_SKYLAKEX +#elif defined(DYN_HASWELL) +#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM +#else +#define gotoblas_SAPPHIRERAPIDS gotoblas_PRESCOTT +#endif #else // not DYNAMIC_LIST @@ -262,15 +275,18 @@ extern gotoblas_t gotoblas_EXCAVATOR; #define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE #define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE #define gotoblas_ZEN gotoblas_SANDYBRIDGE +#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE #else extern gotoblas_t gotoblas_HASWELL; extern gotoblas_t gotoblas_ZEN; #ifndef NO_AVX512 extern gotoblas_t gotoblas_SKYLAKEX; extern gotoblas_t gotoblas_COOPERLAKE; +extern gotoblas_t gotoblas_SAPPHIRERAPIDS; #else #define gotoblas_SKYLAKEX gotoblas_HASWELL #define gotoblas_COOPERLAKE gotoblas_HASWELL +#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL #endif #endif #else @@ -279,6 +295,7 @@ extern gotoblas_t gotoblas_COOPERLAKE; #define gotoblas_HASWELL gotoblas_NEHALEM #define gotoblas_SKYLAKEX gotoblas_NEHALEM #define gotoblas_COOPERLAKE gotoblas_NEHALEM +#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM #define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA #define gotoblas_STEAMROLLER gotoblas_BARCELONA @@ -378,6 +395,31 @@ int support_avx512_bf16(){ #endif } +#define BIT_AMX_TILE 0x01000000 +#define BIT_AMX_BF16 0x00400000 +#define BIT_AMX_ENBD 0x00060000 + +int support_amx_bf16() { +#if !defined(NO_AVX) && !defined(NO_AVX512) + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx512()) + return 0; + // CPUID.7.0:EDX indicates AMX support + cpuid_count(7, 0, &eax, &ebx, &ecx, &edx); + if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) { + // CPUID.D.0:EAX[17:18] indicates AMX enabled + cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + if ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD) + ret = 1; + } + return ret; +#else + return 0; +#endif +} + extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" @@ -689,6 +731,8 @@ static gotoblas_t *get_coretype(void){ } } if (model == 15){ // Sapphire Rapids + if(support_amx_bf16()) + return &gotoblas_SAPPHIRERAPIDS; if(support_avx512_bf16()) return &gotoblas_COOPERLAKE; if (support_avx512()) @@ -762,7 +806,8 @@ static gotoblas_t *get_coretype(void){ } return NULL; } - case 0xf: + break; + case 0xf: if (model <= 0x2) return &gotoblas_NORTHWOOD; return &gotoblas_PRESCOTT; } @@ -941,7 +986,8 @@ static char *corename[] = { "Excavator", "Zen", "SkylakeX", - "Cooperlake" + "Cooperlake", + "SapphireRapids" }; char *gotoblas_corename(void) { @@ -1006,6 +1052,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_ZEN) return corename[23]; if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; if (gotoblas == &gotoblas_COOPERLAKE) return corename[25]; + if (gotoblas == &gotoblas_SAPPHIRERAPIDS) return corename[26]; return corename[0]; } diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 0f47b287c..6b21028d1 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023-2024 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2; #else #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 #endif +#ifdef DYN_ARMV8SVE +extern gotoblas_t gotoblas_ARMV8SVE; +#else +#define gotoblas_ARMV8SVE gotoblas_ARMV8 +#endif #ifdef DYN_CORTEX_A55 extern gotoblas_t gotoblas_CORTEXA55; #else @@ -116,10 +122,11 @@ extern gotoblas_t gotoblas_CORTEXA55; #endif #else extern gotoblas_t gotoblas_CORTEXA53; +#define gotoblas_CORTEXA55 gotoblas_CORTEXA53 extern gotoblas_t gotoblas_CORTEXA57; -extern gotoblas_t gotoblas_CORTEXA72; -extern gotoblas_t gotoblas_CORTEXA73; -extern gotoblas_t gotoblas_FALKOR; +#define gotoblas_CORTEXA72 gotoblas_CORTEXA57 +#define gotoblas_CORTEXA73 gotoblas_CORTEXA57 +#define gotoblas_FALKOR gotoblas_CORTEXA57 extern gotoblas_t gotoblas_THUNDERX; extern gotoblas_t gotoblas_THUNDERX2T99; extern gotoblas_t gotoblas_TSV110; @@ -128,17 +135,21 @@ extern gotoblas_t gotoblas_NEOVERSEN1; #ifndef NO_SVE extern gotoblas_t gotoblas_NEOVERSEV1; extern gotoblas_t gotoblas_NEOVERSEN2; +extern gotoblas_t gotoblas_ARMV8SVE; #else #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 +#define gotoblas_ARMV8SVE gotoblas_ARMV8 #endif extern gotoblas_t gotoblas_THUNDERX3T110; -extern gotoblas_t gotoblas_CORTEXA55; #endif +#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1 extern void openblas_warning(int verbose, const char * msg); +#define FALLBACK_VERBOSE 1 +#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" -#define NUM_CORETYPES 13 +#define NUM_CORETYPES 17 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -147,6 +158,9 @@ extern void openblas_warning(int verbose, const char * msg); #ifndef HWCAP_CPUID #define HWCAP_CPUID (1 << 11) #endif +#ifndef HWCAP_SVE +#define HWCAP_SVE (1 << 22) +#endif #define get_cpu_ftr(id, var) ({ \ __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ @@ -165,9 +179,11 @@ static char *corename[] = { "emag8180", "neoversen1", "neoversev1", + "neoversev2", "neoversen2", "thunderx3t110", "cortexa55", + "armv8sve", "unknown" }; @@ -184,9 +200,11 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; if (gotoblas == &gotoblas_NEOVERSEV1) return corename[11]; - if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12]; - if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13]; - if (gotoblas == &gotoblas_CORTEXA55) return corename[14]; + if (gotoblas == &gotoblas_NEOVERSEV2) return corename[12]; + if (gotoblas == &gotoblas_NEOVERSEN2) return corename[13]; + if (gotoblas == &gotoblas_THUNDERX3T110) return corename[14]; + if (gotoblas == &gotoblas_CORTEXA55) return corename[15]; + if (gotoblas == &gotoblas_ARMV8SVE) return corename[16]; return corename[NUM_CORETYPES]; } @@ -218,9 +236,11 @@ static gotoblas_t *force_coretype(char *coretype) { case 9: return (&gotoblas_EMAG8180); case 10: return (&gotoblas_NEOVERSEN1); case 11: return (&gotoblas_NEOVERSEV1); - case 12: return (&gotoblas_NEOVERSEN2); - case 13: return (&gotoblas_THUNDERX3T110); - case 14: return (&gotoblas_CORTEXA55); + case 12: return (&gotoblas_NEOVERSEV2); + case 13: return (&gotoblas_NEOVERSEN2); + case 14: return (&gotoblas_THUNDERX3T110); + case 15: return (&gotoblas_CORTEXA55); + case 16: return (&gotoblas_ARMV8SVE); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -231,6 +251,10 @@ static gotoblas_t *get_coretype(void) { int implementer, variant, part, arch, revision, midr_el1; char coremsg[128]; +#if defined (OS_DARWIN) + return &gotoblas_NEOVERSEN1; +#endif + #if (!defined OS_LINUX && !defined OS_ANDROID) return NULL; #else @@ -281,9 +305,24 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_NEOVERSEN1; #ifndef NO_SVE case 0xd49: - return &gotoblas_NEOVERSEN2; + if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) { + openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK); + return &gotoblas_NEOVERSEN1; + } else + return &gotoblas_NEOVERSEN2; case 0xd40: - return &gotoblas_NEOVERSEV1; + if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) { + openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK); + return &gotoblas_NEOVERSEN1; + }else + return &gotoblas_NEOVERSEV1; + case 0xd4f: + if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) { + openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK); + return &gotoblas_NEOVERSEN1; + } else { + return &gotoblas_NEOVERSEV2; + } #endif case 0xd05: // Cortex A55 return &gotoblas_CORTEXA55; @@ -328,10 +367,19 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_FALKOR; } break; + case 0x61: // Apple + return &gotoblas_NEOVERSEN1; + break; default: snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); openblas_warning(1, coremsg); } +#ifndef NO_SVE + if ((getauxval(AT_HWCAP) & HWCAP_SVE)) { + return &gotoblas_ARMV8SVE; + } +#endif + return NULL; #endif } diff --git a/driver/others/dynamic_loongarch64.c b/driver/others/dynamic_loongarch64.c index 52f8bcb2f..44de59669 100644 --- a/driver/others/dynamic_loongarch64.c +++ b/driver/others/dynamic_loongarch64.c @@ -25,6 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ +#include #include "common.h" extern gotoblas_t gotoblas_LOONGSON3R5; @@ -74,21 +75,15 @@ static gotoblas_t *force_coretype(char *coretype) { return NULL; } -#define LASX_MASK 1<<7 -#define LSX_MASK 1<<6 -#define LOONGARCH_CFG2 0x02 +#define LA_HWCAP_LSX (1U << 4) +#define LA_HWCAP_LASX (1U << 5) static gotoblas_t *get_coretype(void) { - int ret = 0; - __asm__ volatile ( - "cpucfg %0, %1 \n\t" - : "+&r"(ret) - : "r"(LOONGARCH_CFG2) - ); - - if (ret & LASX_MASK) + int hwcap = (int)getauxval(AT_HWCAP); + + if (hwcap & LA_HWCAP_LASX) return &gotoblas_LOONGSON3R5; - else if (ret & LSX_MASK) + else if (hwcap & LA_HWCAP_LSX) return &gotoblas_LOONGSON2K1000; else return &gotoblas_LOONGSONGENERIC; diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 2847ea9ae..4c1f4a26e 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -3,7 +3,7 @@ extern gotoblas_t gotoblas_POWER6; extern gotoblas_t gotoblas_POWER8; -#if (!defined __GNUC__) || ( __GNUC__ >= 6) +#if ((!defined __GNUC__) || ( __GNUC__ >= 6)) || defined(__clang__) extern gotoblas_t gotoblas_POWER9; #endif #ifdef HAVE_P10_SUPPORT @@ -20,14 +20,14 @@ static char *corename[] = { "POWER10" }; -#define NUM_CORETYPES 4 +#define NUM_CORETYPES 5 char *gotoblas_corename(void) { #ifndef C_PGI if (gotoblas == &gotoblas_POWER6) return corename[1]; #endif if (gotoblas == &gotoblas_POWER8) return corename[2]; -#if (!defined __GNUC__) || ( __GNUC__ >= 6) +#if ((!defined __GNUC__) || ( __GNUC__ >= 6)) || defined(__clang__) if (gotoblas == &gotoblas_POWER9) return corename[3]; #endif #ifdef HAVE_P10_SUPPORT @@ -36,14 +36,44 @@ char *gotoblas_corename(void) { return corename[0]; } -#if defined(__clang__) -static int __builtin_cpu_supports(char* arg) -{ - return 0; -} +#define CPU_UNKNOWN 0 +#define CPU_POWER5 5 +#define CPU_POWER6 6 +#define CPU_POWER8 8 +#define CPU_POWER9 9 +#define CPU_POWER10 10 + +#ifndef POWER_9 +#define POWER_9 0x20000 /* 9 class CPU */ +#endif +#ifndef POWER_10 +#define POWER_10 0x40000 /* 10 class CPU */ #endif -#if defined(C_PGI) || defined(__clang__) +#ifdef _AIX +#include + +static int cpuid(void) +{ + int arch = _system_configuration.implementation; +#ifdef POWER_6 + if (arch == POWER_6) return CPU_POWER6; +#endif +#ifdef POWER_7 + else if (arch == POWER_7) return CPU_POWER6; +#endif +#ifdef POWER_8 + else if (arch == POWER_8) return CPU_POWER8; +#endif +#ifdef POWER_9 + else if (arch == POWER_9) return CPU_POWER9; +#endif +#ifdef POWER_10 + else if (arch >= POWER_10) return CPU_POWER10; +#endif + return CPU_UNKNOWN; +} +#elif defined(C_PGI) || defined(__clang__) /* * NV HPC compilers do not yet implement __builtin_cpu_is(). * Fake a version here for use in the CPU detection code below. @@ -53,21 +83,12 @@ static int __builtin_cpu_supports(char* arg) * what was requested. */ -#include - /* * Define POWER processor version table. * * NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time */ -#define CPU_UNKNOWN 0 -#define CPU_POWER5 5 -#define CPU_POWER6 6 -#define CPU_POWER8 8 -#define CPU_POWER9 9 -#define CPU_POWER10 10 - static struct { uint32_t pvr_mask; uint32_t pvr_value; @@ -160,7 +181,8 @@ static struct { }, }; -static int __builtin_cpu_is(const char *cpu) { +static int cpuid(void) +{ int i; uint32_t pvr; uint32_t cpu_type; @@ -178,15 +200,54 @@ static int __builtin_cpu_is(const char *cpu) { pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type); #endif cpu_type = pvrPOWER[i].cpu_type; + return (int)(cpu_type); +} +#elif !defined(__BUILTIN_CPU_SUPPORTS__) +static int cpuid(void) +{ + return CPU_UNKNOWN; +} +#endif /* _AIX */ + +#ifndef __BUILTIN_CPU_SUPPORTS__ +#include + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif - if (!strcmp(cpu, "power8")) - return cpu_type == CPU_POWER8; - if (!strcmp(cpu, "power9")) - return cpu_type == CPU_POWER9; - return 0; +#if defined(_AIX) || !__has_builtin(__builtin_cpu_is) +static int __builtin_cpu_is(const char *arg) +{ + static int ipinfo = -1; + if (ipinfo < 0) { + ipinfo = cpuid(); + } +#ifdef HAVE_P10_SUPPORT + if (ipinfo == CPU_POWER10) { + if (!strcmp(arg, "power10")) return 1; + } +#endif + if (ipinfo == CPU_POWER9) { + if (!strcmp(arg, "power9")) return 1; + } else if (ipinfo == CPU_POWER8) { + if (!strcmp(arg, "power8")) return 1; +#ifndef C_PGI + } else if (ipinfo == CPU_POWER6) { + if (!strcmp(arg, "power6")) return 1; +#endif + } + return 0; } +#endif -#endif /* C_PGI */ +#if defined(_AIX) || !__has_builtin(__builtin_cpu_supports) +static int __builtin_cpu_supports(const char *arg) +{ + return 0; +} +#endif +#endif static gotoblas_t *get_coretype(void) { @@ -196,19 +257,23 @@ static gotoblas_t *get_coretype(void) { #endif if (__builtin_cpu_is("power8")) return &gotoblas_POWER8; -#if (!defined __GNUC__) || ( __GNUC__ >= 6) +#if ((!defined __GNUC__) || ( __GNUC__ >= 6)) || defined(__clang__) if (__builtin_cpu_is("power9")) return &gotoblas_POWER9; #endif #ifdef HAVE_P10_SUPPORT +#if defined(_AIX) || defined(__clang__) + if (__builtin_cpu_is("power10")) +#else if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) +#endif return &gotoblas_POWER10; #endif /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ #if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) if (__builtin_cpu_is("power10")) return &gotoblas_POWER9; -#endif +#endif return NULL; } @@ -233,7 +298,7 @@ static gotoblas_t *force_coretype(char * coretype) { case 1: return (&gotoblas_POWER6); #endif case 2: return (&gotoblas_POWER8); -#if (!defined __GNUC__) || ( __GNUC__ >= 6) +#if ((!defined __GNUC__) || ( __GNUC__ >= 6)) || defined(__clang__) case 3: return (&gotoblas_POWER9); #endif #ifdef HAVE_P10_SUPPORT @@ -274,6 +339,9 @@ void gotoblas_dynamic_init(void) { if (gotoblas && gotoblas -> init) { strncpy(coren,gotoblas_corename(),20); sprintf(coremsg, "Core: %s\n",coren); + if (getenv("GET_OPENBLAS_CORETYPE")) { + fprintf(stderr, "%s", coremsg); + } openblas_warning(2, coremsg); gotoblas -> init(); } else { diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c index 5b45aae2f..dd26c8e80 100644 --- a/driver/others/dynamic_zarch.c +++ b/driver/others/dynamic_zarch.c @@ -13,7 +13,7 @@ extern gotoblas_t gotoblas_Z14; #define NUM_CORETYPES 4 -extern int openblas_verbose(); +extern int openblas_verbose(void); extern void openblas_warning(int verbose, const char* msg); char* gotoblas_corename(void) { diff --git a/driver/others/memory.c b/driver/others/memory.c index 4493b7d71..4ee8f9a2e 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -73,6 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" +#define NEW_BUFFERS 512 #ifndef likely #ifdef __GNUC__ #define likely(x) __builtin_expect(!!(x), 1) @@ -422,15 +423,13 @@ This value is equal or large than blas_cpu_number. This means some threads are s */ int blas_num_threads = 0; -int blas_num_threads_set = 0; - int goto_get_num_procs (void) { return blas_cpu_number; } -static void blas_memory_init(); +static void blas_memory_init(void); -void openblas_fork_handler() +void openblas_fork_handler(void) { // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is // built with "make USE_OPENMP=0". @@ -447,9 +446,9 @@ void openblas_fork_handler() #endif } -extern int openblas_num_threads_env(); -extern int openblas_goto_num_threads_env(); -extern int openblas_omp_num_threads_env(); +extern int openblas_num_threads_env(void); +extern int openblas_goto_num_threads_env(void); +extern int openblas_omp_num_threads_env(void); int blas_get_cpu_number(void){ #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) @@ -593,7 +592,7 @@ static BLASULONG key_lock = 0UL; #endif /* Returns a pointer to the start of the per-thread memory allocation data */ -static __inline struct alloc_t ** get_memory_table() { +static __inline struct alloc_t ** get_memory_table(void) { #if defined(SMP) LOCK_COMMAND(&key_lock); lsk=local_storage_key; @@ -1146,7 +1145,7 @@ static void blas_memory_cleanup(void* ptr){ } } -static void blas_memory_init(){ +static void blas_memory_init(void){ #if defined(SMP) # if defined(OS_WINDOWS) local_storage_key = TlsAlloc(); @@ -1503,7 +1502,7 @@ static void gotoblas_memory_init(void) { /* Initialization for all function; this function should be called before main */ static int gotoblas_initialized = 0; -extern void openblas_read_env(); +extern void openblas_read_env(void); void CONSTRUCTOR gotoblas_init(void) { @@ -1996,13 +1995,11 @@ This value is equal or large than blas_cpu_number. This means some threads are s */ int blas_num_threads = 0; -int blas_num_threads_set = 0; - int goto_get_num_procs (void) { return blas_cpu_number; } -void openblas_fork_handler() +void openblas_fork_handler(void) { // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is // built with "make USE_OPENMP=0". @@ -2019,9 +2016,9 @@ void openblas_fork_handler() #endif } -extern int openblas_num_threads_env(); -extern int openblas_goto_num_threads_env(); -extern int openblas_omp_num_threads_env(); +extern int openblas_num_threads_env(void); +extern int openblas_goto_num_threads_env(void); +extern int openblas_omp_num_threads_env(void); int blas_get_cpu_number(void){ #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) @@ -2901,7 +2898,7 @@ void *blas_memory_alloc(int procpos){ #endif position ++; - } while (position < 512+NUM_BUFFERS); + } while (position < NEW_BUFFERS + NUM_BUFFERS); } #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); @@ -3015,10 +3012,13 @@ void *blas_memory_alloc(int procpos){ #endif if (memory_overflowed) goto terminate; fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n"); + fprintf(stderr,"To avoid this warning, please rebuild your copy of OpenBLAS with a larger NUM_THREADS setting\n"); + fprintf(stderr,"or set the environment variable OPENBLAS_NUM_THREADS to %d or lower\n", MAX_CPU_NUMBER); memory_overflowed=1; - new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); - newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); - for (i = 0; i < 512; i++) { + MB; + new_release_info = (struct release_t*) malloc(NEW_BUFFERS * sizeof(struct release_t)); + newmemory = (struct newmemstruct*) malloc(NEW_BUFFERS * sizeof(struct newmemstruct)); + for (i = 0; i < NEW_BUFFERS; i++) { newmemory[i].addr = (void *)0; #if defined(WHEREAMI) && !defined(USE_OPENMP) newmemory[i].pos = -1; @@ -3131,12 +3131,12 @@ void blas_memory_free(void *free_area){ printf(" Position : %d\n", position); #endif if (unlikely(memory_overflowed && position >= NUM_BUFFERS)) { - while ((position < NUM_BUFFERS+512) && (newmemory[position-NUM_BUFFERS].addr != free_area)) + while ((position < NUM_BUFFERS+NEW_BUFFERS) && (newmemory[position-NUM_BUFFERS].addr != free_area)) position++; // arm: ensure all writes are finished before other thread takes this memory WMB; - - newmemory[position].used = 0; +if (position - NUM_BUFFERS >= NEW_BUFFERS) goto error; + newmemory[position-NUM_BUFFERS].used = 0; #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif @@ -3214,14 +3214,18 @@ void blas_shutdown(void){ #endif memory[pos].lock = 0; } - if (memory_overflowed) - for (pos = 0; pos < 512; pos ++){ + if (memory_overflowed) { + for (pos = 0; pos < NEW_BUFFERS; pos ++){ newmemory[pos].addr = (void *)0; newmemory[pos].used = 0; #if defined(WHEREAMI) && !defined(USE_OPENMP) newmemory[pos].pos = -1; #endif newmemory[pos].lock = 0; + } + free(newmemory); + newmemory = NULL; + memory_overflowed = 0; } UNLOCK_COMMAND(&alloc_lock); @@ -3339,7 +3343,7 @@ static void gotoblas_memory_init(void) { /* Initialization for all function; this function should be called before main */ static int gotoblas_initialized = 0; -extern void openblas_read_env(); +extern void openblas_read_env(void); void CONSTRUCTOR gotoblas_init(void) { diff --git a/driver/others/memory_qalloc.c b/driver/others/memory_qalloc.c index 0b38d1887..a2593e01f 100644 --- a/driver/others/memory_qalloc.c +++ b/driver/others/memory_qalloc.c @@ -283,13 +283,12 @@ The numbers of threads in the thread pool. This value is equal or large than blas_cpu_number. This means some threads are sleep. */ int blas_num_threads = 0; -int blas_num_threads_set = 0; int goto_get_num_procs (void) { return blas_cpu_number; } -void openblas_fork_handler() +void openblas_fork_handler(void) { // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is // built with "make USE_OPENMP=0". @@ -306,9 +305,9 @@ void openblas_fork_handler() #endif } -extern int openblas_num_threads_env(); -extern int openblas_goto_num_threads_env(); -extern int openblas_omp_num_threads_env(); +extern int openblas_num_threads_env(void); +extern int openblas_goto_num_threads_env(void); +extern int openblas_omp_num_threads_env(void); int blas_get_cpu_number(void){ #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) diff --git a/driver/others/openblas_env.c b/driver/others/openblas_env.c index 35b2270d4..c65f0f320 100644 --- a/driver/others/openblas_env.c +++ b/driver/others/openblas_env.c @@ -41,15 +41,15 @@ static int openblas_env_goto_num_threads=0; static int openblas_env_omp_num_threads=0; static int openblas_env_omp_adaptive=0; -int openblas_verbose() { return openblas_env_verbose;} -unsigned int openblas_thread_timeout() { return openblas_env_thread_timeout;} -int openblas_block_factor() { return openblas_env_block_factor;} -int openblas_num_threads_env() { return openblas_env_openblas_num_threads;} -int openblas_goto_num_threads_env() { return openblas_env_goto_num_threads;} -int openblas_omp_num_threads_env() { return openblas_env_omp_num_threads;} -int openblas_omp_adaptive_env() { return openblas_env_omp_adaptive;} - -void openblas_read_env() { +int openblas_verbose(void) { return openblas_env_verbose;} +unsigned int openblas_thread_timeout(void) { return openblas_env_thread_timeout;} +int openblas_block_factor(void) { return openblas_env_block_factor;} +int openblas_num_threads_env(void) { return openblas_env_openblas_num_threads;} +int openblas_goto_num_threads_env(void) { return openblas_env_goto_num_threads;} +int openblas_omp_num_threads_env(void) { return openblas_env_omp_num_threads;} +int openblas_omp_adaptive_env(void) { return openblas_env_omp_adaptive;} + +void openblas_read_env(void) { int ret=0; env_var_t p; if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p); diff --git a/driver/others/openblas_error_handle.c b/driver/others/openblas_error_handle.c index 9ac72c15d..aa0aa776a 100644 --- a/driver/others/openblas_error_handle.c +++ b/driver/others/openblas_error_handle.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -extern int openblas_verbose(); +extern int openblas_verbose(void); void openblas_warning(int verbose, const char * msg) { int current_verbose; diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 7a5cbeb62..867d0e361 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -69,13 +69,13 @@ static char* openblas_config_str="" ; #ifdef DYNAMIC_ARCH -char *gotoblas_corename(); +char *gotoblas_corename(void); #endif static char tmp_config_str[256]; -int openblas_get_parallel(); +int openblas_get_parallel(void); -char* CNAME() { +char* CNAME(void) { char tmpstr[20]; strcpy(tmp_config_str, openblas_config_str); #ifdef DYNAMIC_ARCH @@ -90,7 +90,7 @@ char tmpstr[20]; } -char* openblas_get_corename() { +char* openblas_get_corename(void) { #ifndef DYNAMIC_ARCH return CHAR_CORENAME; #else diff --git a/driver/others/openblas_get_parallel.c b/driver/others/openblas_get_parallel.c index 5dfda6e59..becfa0a3a 100644 --- a/driver/others/openblas_get_parallel.c +++ b/driver/others/openblas_get_parallel.c @@ -42,17 +42,17 @@ static int parallel = 0; #ifdef NEEDBUNDERSCORE -int CNAME() { +int CNAME(void) { return parallel; } -int NAME() { +int NAME(void) { return parallel; } #else //The CNAME and NAME are the same. -int NAME() { +int NAME(void) { return parallel; } #endif diff --git a/driver/others/openblas_set_num_threads.c b/driver/others/openblas_set_num_threads.c index 0b57867b0..c938fde69 100644 --- a/driver/others/openblas_set_num_threads.c +++ b/driver/others/openblas_set_num_threads.c @@ -36,11 +36,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef SMP_SERVER extern void openblas_set_num_threads(int num_threads) ; +extern int openblas_get_num_threads(void) ; void openblas_set_num_threads_(int* num_threads){ openblas_set_num_threads(*num_threads); } +int openblas_set_num_threads_local(int num_threads){ + int ret = openblas_get_num_threads(); + openblas_set_num_threads(num_threads); + blas_omp_threads_local=num_threads; + return ret; +} + + #else //Single thread @@ -50,4 +59,8 @@ void openblas_set_num_threads(int num_threads) { void openblas_set_num_threads_(int* num_threads){ } + +int openblas_set_num_threads_local(int num_threads){ + return 1; +} #endif diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 0d5c6aec0..de6bf0de4 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -40,7 +40,7 @@ #include #include "common.h" -extern int openblas_block_factor(); +extern int openblas_block_factor(void); int get_L2_size(void); #define DEFAULT_GEMM_P 128 diff --git a/exports/Makefile b/exports/Makefile index d81735342..27a291f34 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -73,6 +73,10 @@ endif endif endif +ifeq ($(F_COMPILER)$(OSNAME), IBMAIX) +EXTRALIB += -lxlf90 +endif + ifeq ($(C_COMPILER), PGI) EXTRALIB += -pgf90libs endif @@ -132,8 +136,12 @@ libgoto_hpl.def : $(GENSYM) ./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) ifeq ($(OSNAME), Darwin) +ifeq ($(FIXED_LIBNAME),1) +INTERNALNAME = $(LIBPREFIX)$(LIBNAMESUFFIX).dylib +else INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib endif +endif ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) $(LIBDYNNAME) : ../$(LIBNAME) osx.def @@ -169,8 +177,12 @@ INTERNALNAME = $(LIBPREFIX).so FEXTRALIB += -lm EXTRALIB += -lm else +ifeq ($(FIXED_LIBNAME),1) +INTERNALNAME = $(LIBPREFIX)$(LIBNAMESUFFIX).so +else INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION) endif +endif ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) ../$(LIBSONAME) : ../$(LIBNAME) linktest.c @@ -248,6 +260,20 @@ endif ifeq ($(OSNAME), AIX) +so : ../$(LIBSONAME) linktest.c + $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(EXTRALIB) && echo OK. + rm -f linktest + +../$(LIBSONAME) : aix.exp + $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ + -Wl,-bE:aix.exp -Wl,-bbigtoc ../$(LIBNAME) $(EXTRALIB) + +aix.exp : + /usr/bin/nm -X32_64 -PCpgl ../$(LIBNAME) | /usr/bin/awk '{ if ((($$ 2 == "T") \ + || ($$ 2 == "D") || ($$ 2 == "B") || ($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) && (substr($$ 1,1,1) != ".")) \ + { if (($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) { print $$ 1 " weak" } else { print $$ 1 } } }' | \ + /usr/bin/sort -u > aix.exp + ifeq ($(COMPILER_F77), xlf) goto32.$(SUFFIX) : ../$(LIBNAME) aix.def @@ -272,23 +298,28 @@ static : ../$(LIBNAME) rm -f goto.$(SUFFIX) osx.def : $(GENSYM) ../Makefile.system ../getarch.c - ./$(GENSYM) osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) + ./$(GENSYM) osx $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) aix.def : $(GENSYM) ../Makefile.system ../getarch.c - ./$(GENSYM) aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) + ./$(GENSYM) aix $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) objcopy.def : $(GENSYM) ../Makefile.system ../getarch.c - ./$(GENSYM) objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) + ./$(GENSYM) objcopy $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) objconv.def : $(GENSYM) ../Makefile.system ../getarch.c - ./$(GENSYM) objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) + ./$(GENSYM) objconv $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) test : linktest.c $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. rm -f linktest linktest.c : $(GENSYM) ../Makefile.system ../getarch.c - ./$(GENSYM) linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c + ./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c +ifeq ($(F_COMPILER), IBM) + mv linktest.c linktest.c.FIRST + egrep -v 'second_|dsecnd_' linktest.c.FIRST > linktest.c + rm linktest.c.FIRST +endif clean :: @rm -f *.def *.dylib __.SYMDEF* *.renamed diff --git a/exports/gensymbol b/exports/gensymbol index f05de626f..226035842 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -21,7 +21,7 @@ blasobjsc=" chbmv chemm chemv cher2 cher2k cher cherk scabs1 scamax chpmv chpr2 chpr crotg cscal csrot csscal cswap scamin scasum scnrm2 csymm csyr2k csyrk ctbmv ctbsv ctpmv ctpsv ctrmm ctrmv ctrsm - ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum" + ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum cgemmt" blasobjsd=" damax damin dasum daxpy daxpby dcabs1 dcopy ddot dgbmv dgemm @@ -29,7 +29,7 @@ blasobjsd=" dscal dsdot dspmv dspr2 dimatcopy domatcopy dspr dswap dsymm dsymv dsyr2 dsyr2k dsyr dsyrk dtbmv dtbsv dtpmv dtpsv dtrmm dtrmv dtrsm dtrsv - idamax idamin idmax idmin dgeadd dsum" + idamax idamin idmax idmin dgeadd dsum dgemmt" blasobjss=" isamax isamin ismax ismin @@ -38,7 +38,7 @@ blasobjss=" smax smin snrm2 simatcopy somatcopy srot srotg srotm srotmg ssbmv sscal sspmv sspr2 sspr sswap ssymm ssymv ssyr2 ssyr2k ssyr ssyrk stbmv stbsv stpmv stpsv - strmm strmv strsm strsv sgeadd ssum" + strmm strmv strsm strsv sgeadd ssum sgemmt" blasobjsz=" izamax izamin @@ -48,7 +48,7 @@ blasobjsz=" zhpr zrotg zscal zswap zsymm zsyr2k zsyrk ztbmv ztbsv ztpmv ztpsv ztrmm ztrmv ztrsm ztrsv zomatcopy zimatcopy dzamax dzamin dzasum dznrm2 - zgeadd dzsum" + zgeadd dzsum zgemmt" blasobjs="lsame xerbla" bfblasobjs="sbgemm sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod" @@ -58,8 +58,9 @@ cblasobjsc=" cblas_cher cblas_cherk cblas_chpmv cblas_chpr2 cblas_chpr cblas_cscal cblas_caxpby cblas_csscal cblas_cswap cblas_csymm cblas_csyr2k cblas_csyrk cblas_ctbmv cblas_cgeadd cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv - cblas_scnrm2 cblas_scasum + cblas_scnrm2 cblas_scasum cblas_cgemmt cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy + cblas_caxpyc cblas_crotg cblas_csrot cblas_scamax cblas_scamin " cblasobjsd=" cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot @@ -67,8 +68,9 @@ cblasobjsd=" cblas_drot cblas_drotg cblas_drotm cblas_drotmg cblas_dsbmv cblas_dscal cblas_dsdot cblas_dspmv cblas_dspr2 cblas_dspr cblas_dswap cblas_dsymm cblas_dsymv cblas_dsyr2 cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv - cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd + cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy + cblas_damax cblas_damin " cblasobjss=" @@ -78,8 +80,9 @@ cblasobjss=" cblas_srotm cblas_srotmg cblas_ssbmv cblas_sscal cblas_sspmv cblas_sspr2 cblas_sspr cblas_sswap cblas_ssymm cblas_ssymv cblas_ssyr2 cblas_ssyr2k cblas_ssyr cblas_ssyrk cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm - cblas_strsv cblas_sgeadd + cblas_strsv cblas_sgeadd cblas_sgemmt cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy + cblas_samax cblas_samin " cblasobjsz=" @@ -89,8 +92,9 @@ cblasobjsz=" cblas_zhpr cblas_zscal cblas_zswap cblas_zsymm cblas_zsyr2k cblas_zsyrk cblas_ztbmv cblas_ztbsv cblas_ztpmv cblas_ztpsv cblas_ztrmm cblas_ztrmv cblas_ztrsm cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub - cblas_zaxpby cblas_zgeadd + cblas_zaxpby cblas_zgeadd cblas_zgemmt cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy + cblas_zaxpyc cblas_zdrot cblas_zrotg cblas_dzamax cblas_dzamin " cblasobjs="cblas_xerbla" @@ -716,6 +720,7 @@ lapackobjs2z="$lapackobjs2z # functions added for lapack-3.7.0 lapackobjs2s="$lapackobjs2s slarfy + ssyconvf strevc3 sgelqt sgelqt3 @@ -843,6 +848,70 @@ lapackobjs2z="$lapackobjs2z zungtsqr_row " +#functions added for lapack-3.11 +lapackobjs2c="$lapackobjs2c + cgedmd + cgedmdq + " +lapackobjs2d="$lapackobjs2d + dgedmd + dgedmdq + " +lapackobjs2s="$lapackobjs2s + sgedmd + sgedmdq + " +lapackobjs2z="$lapackobjs2z + zgedmd + zgedmdq + " + +#functions added post 3.11 + +lapackobjs2c="$lapackobjs2c + claqp2rk + claqp3rk + ctrsyl3 + " +# claqz0 +# claqz1 +# claqz2 +# claqz3 +# clatrs3 + +lapackobjs2d="$lapackobjs2d + dgelqs + dgelst + dgeqp3rk + dgeqrs + dlaqp2rk + dlaqp3rk + dlarmm + dlatrs3 + dtrsyl3 + " +# dlaqz0 +# dlaqz1 +# dlaqz2 +# dlaqz3 +# dlaqz4 + +lapackobjs2z="$lapackobjs2z + zgelqs + zgelst + zgeqp3rk + zgeqrs + zlaqp2rk + zlaqp3rk + zlatrs3 + zrscl + ztrsyl3 + " +# zlaqz0 +# zlaqz1 +# zlaqz2 +# zlaqz3 + lapack_extendedprecision_objs=" zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx @@ -1012,6 +1081,10 @@ lapackeobjsc=" LAPACKE_cgebrd_work LAPACKE_cgecon LAPACKE_cgecon_work + LAPACKE_cgedmd + LAPACKE_cgedmd_work + LAPACKE_cgedmdq + LAPACKE_cgedmdq_work LAPACKE_cgeequ LAPACKE_cgeequ_work LAPACKE_cgeequb @@ -1600,6 +1673,14 @@ lapackeobjsc=" LAPACKE_cgetsqrhrt_work LAPACKE_cungtsqr_row LAPACKE_cungtsqr_row_work + LAPACKE_clangb + LAPACKE_clangb_work + LAPACKE_ctrsyl3 + LAPACKE_ctrsyl3_work + LAPACKE_ctz_nancheck + LAPACKE_ctz_trans + LAPACKE_cunhr_col + LAPACKE_cunhr_col_work " lapackeobjsd=" @@ -1671,6 +1752,10 @@ lapackeobjsd=" LAPACKE_dgebrd_work LAPACKE_dgecon LAPACKE_dgecon_work + LAPACKE_dgedmd + LAPACKE_dgedmd_work + LAPACKE_dgedmdq + LAPACKE_dgedmdq_work LAPACKE_dgeequ LAPACKE_dgeequ_work LAPACKE_dgeequb @@ -2213,6 +2298,14 @@ lapackeobjsd=" LAPACKE_dgetsqrhrt_work LAPACKE_dorgtsqr_row LAPACKE_dorgtsqr_row_work + LAPACKE_dlangb + LAPACKE_dlangb_work + LAPACKE_dorhr_col + LAPACKE_dorhr_col_work + LAPACKE_dtrsyl3 + LAPACKE_dtrsyl3_work + LAPACKE_dtz_nancheck + LAPACKE_dtz_trans " lapackeobjss=" @@ -2284,6 +2377,10 @@ lapackeobjss=" LAPACKE_sgebrd_work LAPACKE_sgecon LAPACKE_sgecon_work + LAPACKE_sgedmd + LAPACKE_sgedmd_work + LAPACKE_sgedmdq + LAPACKE_sgedmdq_work LAPACKE_sgeequ LAPACKE_sgeequ_work LAPACKE_sgeequb @@ -2818,6 +2915,14 @@ lapackeobjss=" LAPACKE_sgetsqrhrt_work LAPACKE_sorgtsqr_row LAPACKE_sorgtsqr_row_work + LAPACKE_slangb + LAPACKE_slangb_work + LAPACKE_sorhr_col + LAPACKE_sorhr_col_work + LAPACKE_strsyl3 + LAPACKE_strsyl3_work + LAPACKE_stz_nancheck + LAPACKE_stz_trans " lapackeobjsz=" @@ -2893,6 +2998,10 @@ lapackeobjsz=" LAPACKE_zgebrd_work LAPACKE_zgecon LAPACKE_zgecon_work + LAPACKE_zgedmd + LAPACKE_zgedmd_work + LAPACKE_zgedmdq + LAPACKE_zgedmdq_work LAPACKE_zgeequ LAPACKE_zgeequ_work LAPACKE_zgeequb @@ -3481,6 +3590,14 @@ lapackeobjsz=" LAPACKE_zgetsqrhrt_work LAPACKE_zungtsqr_row LAPACKE_zungtsqr_row_work + LAPACKE_zlangb + LAPACKE_zlangb_work + LAPACKE_ztrsyl3 + LAPACKE_ztrsyl3_work + LAPACKE_ztz_nancheck + LAPACKE_ztz_trans + LAPACKE_zunhr_col + LAPACKE_zunhr_col_work " ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the @@ -3582,6 +3699,7 @@ lapack_embeded_underscore_objs_s=" ssysv_aa_2stage ssytrf_aa_2stage ssytrs_aa_2stage slaorhr_col_getrfnp slaorhr_col_getrfnp2 sorhr_col + slarfb_gett " lapack_embeded_underscore_objs_c=" chetf2_rook chetrf_rook chetri_rook @@ -3607,6 +3725,7 @@ lapack_embeded_underscore_objs_c=" csysv_aa_2stage csytrf_aa_2stage csytrs_aa_2stage claunhr_col_getrfnp claunhr_col_getrfnp2 cunhr_col + clarfb_gett " lapack_embeded_underscore_objs_d=" dlasyf_rook @@ -3624,6 +3743,7 @@ lapack_embeded_underscore_objs_d=" dsysv_aa_2stage dsytrf_aa_2stage dsytrs_aa_2stage dlaorhr_col_getrfnp dlaorhr_col_getrfnp2 dorhr_col + dlarfb_gett " lapack_embeded_underscore_objs_z=" zhetf2_rook zhetrf_rook zhetri_rook @@ -3648,6 +3768,7 @@ lapack_embeded_underscore_objs_z=" zhetrs_aa_2stage zsysv_aa_2stage zsytrf_aa_2stage zsytrs_aa_2stage zlaunhr_col_getrfnp zlaunhr_col_getrfnp2 zunhr_col + zlarfb_gett " dirname=`pwd -P`/../lapack-netlib diff --git a/exports/gensymbol.pl b/exports/gensymbol.pl index e38a3cc89..dd79e924d 100644 --- a/exports/gensymbol.pl +++ b/exports/gensymbol.pl @@ -21,7 +21,7 @@ chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax, chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2, csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm, - ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum); + ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt); @blasobjsd = ( damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm, @@ -29,7 +29,7 @@ dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy, dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv, dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv, - idamax,idamin,idmax,idmin,dgeadd,dsum); + idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt); @blasobjss = ( isamax,isamin,ismax,ismin, @@ -38,7 +38,7 @@ smax,smin,snrm2,simatcopy,somatcopy, srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, - strmm,strmv,strsm,strsv, sgeadd,ssum); + strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt); @blasobjsz = ( izamax,izamin,, @@ -48,7 +48,7 @@ zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv, ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2, - zgeadd, dzsum); + zgeadd, dzsum, zgemmt); @blasobjs = (lsame, xerbla); @bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); @@ -60,7 +60,7 @@ cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, cblas_scnrm2, cblas_scasum, cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy - ); + cblas_cgemmt); @cblasobjsd = ( cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot, cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2, @@ -69,7 +69,7 @@ cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv, cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy - ); + cblas_dgemmt); @cblasobjss = ( cblas_sasum, cblas_saxpy, cblas_saxpby, @@ -80,7 +80,7 @@ cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, cblas_strsv, cblas_sgeadd, cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy - ); + cblas_sgemmt); @cblasobjsz = ( cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal, cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm, @@ -90,7 +90,7 @@ cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub, cblas_zaxpby, cblas_zgeadd, cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy -); + cblas_zgemmt); @cblasobjs = ( cblas_xerbla ); diff --git a/f_check b/f_check index d071e016e..81f598ffa 100755 --- a/f_check +++ b/f_check @@ -45,7 +45,7 @@ if [ -z "$compiler" ]; then pathf90 pathf95 pgf95 pgf90 pgf77 pgfortran nvfortran flang egfortran - ifort nagfor ifx ftn crayftn" + ifort nagfor ifx ftn crayftn armflang" for list in $lists; do for p in $path; do @@ -85,7 +85,11 @@ else *Hewlett*) vendor=CRAY openmp='-fopenmp' - ;; + ;; + *Arm\ F90*) + vendor=FLANG + openmp='-fopenmp' + ;; *GNU*|*GCC*) v="${data#*GCC: *\) }" @@ -101,7 +105,14 @@ else *flang*) vendor=FLANG openmp='-fopenmp' - ;; + data=`$compiler -v 2>&1 > /dev/null ` + v="${data#*version *}" + v="${v%%*.}" + major="${v%%.*}" + if [ "$major" -ge 17 ]; then + vendor=FLANGNEW + fi + ;; *ifort*|*ifx*) vendor=INTEL openmp='-fopenmp' @@ -110,6 +121,9 @@ else vendor=PGI openmp='-mp' ;; + *xlf*) + vendor=IBM + ;; *) vendor=G77 openmp='' @@ -148,6 +162,10 @@ else *'IBM XL'*) vendor=IBM openmp='-openmp' + case "$CC" in *gcc*) + bu=_ + ;; + esac ;; *NAG*) vendor=NAG @@ -216,6 +234,10 @@ else *ppuf*|*xlf*) vendor=IBM openmp='-openmp' + case "$CC" in *gcc*) + bu=_ + ;; + esac ;; *open64*) vendor=OPEN64 @@ -355,13 +377,6 @@ if [ -n "$link" ]; then ;; esac - case "$flag" in *-lgomp*) - case "$CC" in *clang*) - flag="-lomp" - ;; - esac - esac - case "$flag" in -l*) case "$flag" in *ibrary*|*gfortranbegin*|*flangmain*|*frtbegin*|*pathfstart*|\ diff --git a/getarch.c b/getarch.c index 937a8db68..2b5459a5f 100644 --- a/getarch.c +++ b/getarch.c @@ -90,7 +90,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #endif -#if defined(AIX) +#if defined(_AIX) +#include +#include #include #endif @@ -150,6 +152,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_EV4 */ /* #define FORCE_EV5 */ /* #define FORCE_EV6 */ +/* #define FORCE_CSKY */ +/* #define FORCE_CK860FV */ /* #define FORCE_GENERIC */ #ifdef FORCE_P2 @@ -1677,9 +1681,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "c910v" #define CORENAME "C910V" #endif +#endif +#ifdef FORCE_x280 +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "x280" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-Dx280 " \ + "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "x280" +#define CORENAME "x280" #else #endif +#ifdef FORCE_RISCV64_ZVL256B +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_ZVL256B" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_ZVL256B " \ + "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_zvl256b" +#define CORENAME "RISCV64_ZVL256B" +#endif + +#ifdef FORCE_RISCV64_ZVL128B +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_ZVL128B" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_ZVL128B " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_zvl128b" +#define CORENAME "RISCV64_ZVL128B" +#endif #if defined(FORCE_E2K) || defined(__e2k__) #define FORCE @@ -1692,6 +1733,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "generic" #endif +#ifdef FORCE_CSKY +#define FORCE +#define ARCHITECTURE "CSKY" +#define SUBARCHITECTURE "CSKY" +#define SUBDIRNAME "csky" +#define ARCHCONFIG "-DCSKY" \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "csky" +#define CORENAME "CSKY" +#endif + +#ifdef FORCE_CK860FV +#define FORCE +#define ARCHITECTURE "CSKY" +#define SUBARCHITECTURE "CK860V" +#define SUBDIRNAME "csky" +#define ARCHCONFIG "-DCK860FV " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "ck860fv" +#define CORENAME "CK860FV" +#endif + + #ifndef FORCE #ifdef USER_TARGET @@ -1766,7 +1834,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif - #ifndef OPENBLAS_SUPPORTED #error "This arch/CPU is not supported by OpenBLAS." #endif @@ -1805,11 +1872,13 @@ static int get_num_cores(void) { return count; -#elif defined(AIX) +#elif defined(_AIX) //returns the number of processors which are currently online count = sysconf(_SC_NPROCESSORS_ONLN); if (count <= 0) count = 2; - + + return count; + #else return 2; #endif @@ -1831,7 +1900,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("CORE=%s\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) || defined(__csky__) printf("CORE=%s\n", get_corename()); #endif #endif @@ -1930,15 +1999,15 @@ printf("ELF_VERSION=2\n"); #ifdef MAKE_NB_JOBS #if MAKE_NB_JOBS > 0 - printf("MAKE += -j %d\n", MAKE_NB_JOBS); + printf("MAKEFLAGS += -j %d\n", MAKE_NB_JOBS); #else // Let make use parent -j argument or -j1 if there // is no make parent #endif #elif NO_PARALLEL_MAKE==1 - printf("MAKE += -j 1\n"); + printf("MAKEFLAGS += -j 1\n"); #else - printf("MAKE += -j %d\n", get_num_cores()); + printf("MAKEFLAGS += -j %d\n", get_num_cores()); #endif break; @@ -1979,7 +2048,7 @@ printf("ELF_VERSION=2\n"); #ifdef FORCE printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__csky__) printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); #endif #endif diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 4e082928b..55374674a 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -119,6 +119,7 @@ endif () if (BUILD_BFLOAT16) GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") @@ -130,6 +131,8 @@ endif () foreach (float_type ${FLOAT_TYPES}) if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + GenerateNamedObjects("zaxpy.c" "" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type}) + GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type}) diff --git a/interface/Makefile b/interface/Makefile index 2ac9663d6..048d679d6 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -270,7 +270,8 @@ CSBLAS1OBJS = \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ - cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) + cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_samax.$(SUFFIX) \ + cblas_samin.$(SUFFIX) CSBLAS2OBJS = \ cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ @@ -295,7 +296,8 @@ CDBLAS1OBJS = \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ - cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) + cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_damax.$(SUFFIX) \ + cblas_damin.$(SUFFIX) CDBLAS2OBJS = \ cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ @@ -315,7 +317,7 @@ CCBLAS1OBJS = \ cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ - cblas_caxpby.$(SUFFIX) \ + cblas_caxpby.$(SUFFIX) cblas_scamax.$(SUFFIX) cblas_caxpyc.$(SUFFIX) cblas_scamin.$(SUFFIX) \ cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) CCBLAS2OBJS = \ @@ -340,12 +342,12 @@ CXERBLAOBJ = \ CZBLAS1OBJS = \ cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ - cblas_zcopy.$(SUFFIX) \ + cblas_zcopy.$(SUFFIX) cblas_dzamax.$(SUFFIX) cblas_dzamin.$(SUFFIX) \ cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ - cblas_zaxpby.$(SUFFIX) \ + cblas_zaxpby.$(SUFFIX) cblas_zaxpyc.$(SUFFIX) \ cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) @@ -1301,7 +1303,7 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c ifeq ($(BUILD_BFLOAT16),1) sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) -sbgemmt.$(SUFFIX) sbgemm.$(PSUFFIX) : gemmt.c ../param.h +sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) endif @@ -1533,6 +1535,30 @@ cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) +cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_scamax.$(SUFFIX) cblas_scamax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_dzamax.$(SUFFIX) cblas_dzamax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_scamin.$(SUFFIX) cblas_scamin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_dzamin.$(SUFFIX) cblas_dzamin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1627,6 +1653,15 @@ cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) +cblas_caxpyc.$(SUFFIX) cblas_caxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1932,7 +1967,7 @@ cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) ifeq ($(BUILD_BFLOAT16),1) -cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h +cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) endif diff --git a/interface/geadd.c b/interface/geadd.c index f0befa14a..a2e6d1081 100644 --- a/interface/geadd.c +++ b/interface/geadd.c @@ -68,7 +68,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, info = 0; - if (lda < MAX(1, m)) info = 6; + if (lda < MAX(1, m)) info = 5; if (ldc < MAX(1, m)) info = 8; if (n < 0) info = 2; @@ -117,8 +117,8 @@ void CNAME(enum CBLAS_ORDER order, if (ldc < MAX(1, m)) info = 8; if (lda < MAX(1, m)) info = 5; - if (n < 0) info = 2; - if (m < 0) info = 1; + if (n < 0) info = 1; + if (m < 0) info = 2; } if (info >= 0) { diff --git a/interface/gemm.c b/interface/gemm.c index 71cc77a1b..4778b641b 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -154,6 +154,23 @@ static size_t zgemm_small_kernel_b0[] = { #endif #endif +#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16) +#define XFEATURE_XTILEDATA 18 +#define ARCH_REQ_XCOMP_PERM 0x1023 +static int openblas_amxtile_permission = 0; +static int init_amxtile_permission() { + long status = + syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA); + if (status != 0) { + fprintf(stderr, "XTILEDATA permission not granted in your device(Linux, " + "Intel Sapphier Rapids), skip sbgemm calculation\n"); + return -1; + } + openblas_amxtile_permission = 1; + return 0; +} +#endif + #ifndef CBLAS void NAME(char *TRANSA, char *TRANSB, @@ -455,6 +472,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #endif +#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16) +#if defined(DYNAMIC_ARCH) + if (gotoblas->need_amxtile_permission && + openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) { + return; + } +#endif +#if !defined(DYNAMIC_ARCH) && defined(SAPPHIRERAPIDS) + if (openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) { + return; + } +#endif +#endif // defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16) + if ((args.m == 0) || (args.n == 0)) return; #if 0 diff --git a/interface/gemmt.c b/interface/gemmt.c index 3eed1dfe4..cae00877e 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -35,29 +35,26 @@ #include #include #include "common.h" -#ifdef FUNCTION_PROFILE -#include "functable.h" -#endif #ifndef COMPLEX #define SMP_THRESHOLD_MIN 65536.0 #ifdef XDOUBLE -#define ERROR_NAME "QGEMT " +#define ERROR_NAME "QGEMMT " #elif defined(DOUBLE) -#define ERROR_NAME "DGEMT " +#define ERROR_NAME "DGEMMT " #elif defined(BFLOAT16) -#define ERROR_NAME "SBGEMT " +#define ERROR_NAME "SBGEMMT " #else -#define ERROR_NAME "SGEMT " +#define ERROR_NAME "SGEMMT " #endif #else #define SMP_THRESHOLD_MIN 8192.0 #ifdef XDOUBLE -#define ERROR_NAME "XGEMT " +#define ERROR_NAME "XGEMMT " #elif defined(DOUBLE) -#define ERROR_NAME "ZGEMT " +#define ERROR_NAME "ZGEMMT " #else -#define ERROR_NAME "CGEMT " +#define ERROR_NAME "CGEMMT " #endif #endif @@ -68,18 +65,22 @@ #ifndef CBLAS void NAME(char *UPLO, char *TRANSA, char *TRANSB, - blasint * M, blasint * N, blasint * K, + blasint * M, blasint * K, FLOAT * Alpha, IFLOAT * a, blasint * ldA, IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) { - blasint m, n, k; + blasint m, k; blasint lda, ldb, ldc; int transa, transb, uplo; blasint info; char transA, transB, Uplo; + blasint nrowa, nrowb; +#if defined(COMPLEX) + blasint ncolb; +#endif IFLOAT *buffer; IFLOAT *aa, *bb; FLOAT *cc; @@ -92,7 +93,6 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, PRINT_DEBUG_NAME; m = *M; - n = *N; k = *K; #if defined(COMPLEX) @@ -158,33 +158,48 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, uplo = 0; if (Uplo == 'L') uplo = 1; + + nrowa = m; + if (transa & 1) nrowa = k; + nrowb = k; +#if defined(COMPLEX) + ncolb = m; +#endif + if (transb & 1) { + nrowb = m; +#if defined(COMPLEX) + ncolb = k; +#endif + } info = 0; - if (uplo < 0) - info = 14; - if (ldc < m) + if (ldc < MAX(1, m)) info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; if (k < 0) info = 5; - if (n < 0) - info = 4; if (m < 0) - info = 3; + info = 4; if (transb < 0) - info = 2; + info = 3; if (transa < 0) + info = 2; + if (uplo < 0) info = 1; - if (info) { + if (info != 0) { BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, - blasint N, blasint k, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m, + blasint k, #ifndef COMPLEX FLOAT alpha, IFLOAT * A, blasint LDA, @@ -205,17 +220,23 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int transa, transb, uplo; blasint info; - blasint m, n, lda, ldb; + blasint lda, ldb; FLOAT *a, *b; +#if defined(COMPLEX) + blasint nrowb, ncolb; +#endif XFLOAT *buffer; PRINT_DEBUG_CNAME; + uplo = -1; transa = -1; transb = -1; info = 0; if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; if (TransA == CblasNoTrans) transa = 0; @@ -248,9 +269,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, transb = 3; #endif - m = M; - n = N; - a = (void *)A; b = (void *)B; lda = LDA; @@ -258,23 +276,42 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, info = -1; - if (ldc < m) + blasint nrowa; +#if !defined(COMPLEX) + blasint nrowb; +#endif + nrowa = m; + if (transa & 1) nrowa = k; + nrowb = k; +#if defined(COMPLEX) + ncolb = m; +#endif + if (transb & 1) { + nrowb = m; +#if defined(COMPLEX) + ncolb = k; +#endif + } + + if (ldc < MAX(1, m)) info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; if (k < 0) info = 5; - if (n < 0) - info = 4; if (m < 0) - info = 3; + info = 4; if (transb < 0) - info = 2; + info = 3; if (transa < 0) + info = 2; + if (uplo < 0) info = 1; } if (order == CblasRowMajor) { - m = N; - n = M; a = (void *)B; b = (void *)A; @@ -282,6 +319,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, lda = LDB; ldb = LDA; + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + if (TransB == CblasNoTrans) transa = 0; if (TransB == CblasTrans) @@ -315,29 +355,42 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, info = -1; - if (ldc < m) + blasint ncola; +#if !defined(COMPLEX) + blasint ncolb; +#endif + ncola = m; + if (transa & 1) ncola = k; + ncolb = k; +#if defined(COMPLEX) + nrowb = m; +#endif + + if (transb & 1) { +#if defined(COMPLEX) + nrowb = k; +#endif + ncolb = m; + } + + if (ldc < MAX(1,m)) info = 13; + if (ldb < MAX(1, ncolb)) + info = 8; + if (lda < MAX(1, ncola)) + info = 10; if (k < 0) info = 5; - if (n < 0) - info = 4; if (m < 0) - info = 3; + info = 4; if (transb < 0) info = 2; if (transa < 0) + info = 3; + if (uplo < 0) info = 1; - } - uplo = -1; - if (Uplo == CblasUpper) - uplo = 0; - if (Uplo == CblasLower) - uplo = 1; - if (uplo < 0) - info = 14; - if (info >= 0) { BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); return; @@ -407,37 +460,48 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif - if ((m == 0) || (n == 0)) + if (m == 0) return; IDEBUG_START; - FUNCTION_PROFILE_START(); +#if defined(COMPLEX) + if (transb > 1){ +#ifndef CBLAS + IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); +#else + if (order == CblasColMajor) + IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); + if (order == CblasRowMajor) + IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); +#endif + } +#endif - const blasint incb = (transb == 0) ? 1 : ldb; + const blasint incb = ((transb & 1) == 0) ? 1 : ldb; if (uplo == 1) { - for (i = 0; i < n; i++) { - j = n - i; + for (i = 0; i < m; i++) { + j = m - i; l = j; #if defined(COMPLEX) aa = a + i * 2; bb = b + i * ldb * 2; - if (transa) { - l = k; + if (transa & 1) { aa = a + lda * i * 2; - bb = b + i * 2; } + if (transb & 1) + bb = b + i * 2; cc = c + i * 2 * ldc + i * 2; #else aa = a + i; bb = b + i * ldb; - if (transa) { - l = k; + if (transa & 1) { aa = a + lda * i; - bb = b + i; } + if (transb & 1) + bb = b + i; cc = c + i * ldc + i; #endif @@ -447,7 +511,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) - return; + continue; #else if (beta != ONE) SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); @@ -458,15 +522,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, IDEBUG_START; - FUNCTION_PROFILE_START(); - - buffer_size = j + k + 128 / sizeof(FLOAT); + buffer_size = 2 * (j + k) + 128 / sizeof(FLOAT); #ifdef WINDOWS_ABI buffer_size += 160 / sizeof(FLOAT); #endif // for alignment buffer_size = (buffer_size + 3) & ~3; - STACK_ALLOC(buffer_size, FLOAT, buffer); + STACK_ALLOC(buffer_size, IFLOAT, buffer); #ifdef SMP @@ -479,20 +541,34 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif #if defined(COMPLEX) + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, aa, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i, + aa, lda, bb, incb, cc, 1, + buffer); #else + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha, aa, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha, aa, lda, + bb, incb, cc, 1, buffer); #endif #ifdef SMP } else { - + if (!(transa & 1)) (gemv_thread[(int)transa]) (j, k, alpha, aa, lda, bb, incb, cc, 1, buffer, nthreads); + else + (gemv_thread[(int)transa]) (k, j, alpha, aa, + lda, bb, incb, cc, + 1, buffer, + nthreads); } #endif @@ -501,21 +577,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } } else { - for (i = 0; i < n; i++) { + for (i = 0; i < m; i++) { j = i + 1; l = j; #if defined COMPLEX bb = b + i * ldb * 2; - if (transa) { - l = k; + if (transb & 1) { bb = b + i * 2; } cc = c + i * 2 * ldc; #else bb = b + i * ldb; - if (transa) { - l = k; + if (transb & 1) { bb = b + i; } cc = c + i * ldc; @@ -527,7 +601,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) - return; + continue; #else if (beta != ONE) SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); @@ -537,15 +611,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif IDEBUG_START; - FUNCTION_PROFILE_START(); - - buffer_size = j + k + 128 / sizeof(FLOAT); + buffer_size = 2 * (j + k) + 128 / sizeof(FLOAT); #ifdef WINDOWS_ABI buffer_size += 160 / sizeof(FLOAT); #endif // for alignment buffer_size = (buffer_size + 3) & ~3; - STACK_ALLOC(buffer_size, FLOAT, buffer); + STACK_ALLOC(buffer_size, IFLOAT, buffer); #ifdef SMP @@ -558,30 +630,39 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif #if defined(COMPLEX) + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, a, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i, + a, lda, bb, incb, cc, 1, + buffer); #else + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha, a, lda, bb, + incb, cc, 1, buffer); #endif #ifdef SMP } else { - + if (!(transa & 1)) (gemv_thread[(int)transa]) (j, k, alpha, a, lda, bb, incb, cc, 1, buffer, nthreads); - + else + (gemv_thread[(int)transa]) (k, j, alpha, a, lda, + bb, incb, cc, 1, + buffer, nthreads); } #endif STACK_FREE(buffer); } } - FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, - args.m * args.k + args.k * args.n + - args.m * args.n, 2 * args.m * args.n * args.k); IDEBUG_END; diff --git a/interface/gemv.c b/interface/gemv.c index 1f0763579..2c121f130 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -226,7 +226,7 @@ void CNAME(enum CBLAS_ORDER order, #ifdef SMP - if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD ) + if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD ) nthreads = 1; else nthreads = num_cpu_avail(2); diff --git a/interface/imatcopy.c b/interface/imatcopy.c index 91975f7f4..69876e31e 100644 --- a/interface/imatcopy.c +++ b/interface/imatcopy.c @@ -100,37 +100,42 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, if ( order == BlasColMajor) { - if ( trans == BlasNoTrans && *ldb < *rows ) info = 9; - if ( trans == BlasTrans && *ldb < *cols ) info = 9; + if ( trans == BlasNoTrans && *ldb < MAX(1,*rows) ) info = 8; + if ( trans == BlasTrans && *ldb < MAX(1,*cols) ) info = 8; } if ( order == BlasRowMajor) { - if ( trans == BlasNoTrans && *ldb < *cols ) info = 9; - if ( trans == BlasTrans && *ldb < *rows ) info = 9; + if ( trans == BlasNoTrans && *ldb < MAX(1,*cols) ) info = 8; + if ( trans == BlasTrans && *ldb < MAX(1,*rows) ) info = 8; } - if ( order == BlasColMajor && *lda < *rows ) info = 7; - if ( order == BlasRowMajor && *lda < *cols ) info = 7; - if ( *cols <= 0 ) info = 4; - if ( *rows <= 0 ) info = 3; - if ( trans < 0 ) info = 2; - if ( order < 0 ) info = 1; + if ( order == BlasColMajor && *lda < MAX(1,*rows) ) info = 7; + if ( order == BlasRowMajor && *lda < MAX(1,*cols) ) info = 7; + if ( *cols < 0 ) info = 4; + if ( *rows < 0 ) info = 3; + if ( trans < 0 ) info = 2; + if ( order < 0 ) info = 1; if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } + + if ((*rows == 0) || (*cols == 0)) return; + #ifdef NEW_IMATCOPY - if ( *lda == *ldb && *rows == *cols) { + if ( *lda == *ldb ) { if ( order == BlasColMajor ) { if ( trans == BlasNoTrans ) { IMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda ); + return; } - else + else if ( *rows == *cols ) { IMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda ); + return; } } else @@ -138,26 +143,26 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, if ( trans == BlasNoTrans ) { IMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda ); + return; } - else + else if ( *rows == *cols ) { IMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda ); + return; } } - return; } - #endif - if ( *lda > *ldb ) - msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT); - else - msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT); + if ( *rows > *cols ) + msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT); + else + msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT); b = malloc(msize); if ( b == NULL ) { - printf("Memory alloc failed\n"); + printf("Memory alloc failed in imatcopy\n"); exit(1); } @@ -165,26 +170,26 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, { if ( trans == BlasNoTrans ) { - OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *ldb ); - OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *ldb, a, *ldb ); + OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *rows ); + OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *rows, a, *ldb ); } else { - OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *ldb ); - OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, b, *ldb, a, *ldb ); + OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *cols ); + OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, b, *cols, a, *ldb ); } } else { if ( trans == BlasNoTrans ) { - OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *ldb ); - OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *ldb, a, *ldb ); + OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *cols ); + OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *cols, a, *ldb ); } else { - OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *ldb ); - OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, b, *ldb, a, *ldb ); + OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *rows ); + OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, b, *rows, a, *ldb ); } } diff --git a/interface/lapack/gesv.c b/interface/lapack/gesv.c index 175350329..546c2bed2 100644 --- a/interface/lapack/gesv.c +++ b/interface/lapack/gesv.c @@ -114,7 +114,14 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv, #ifdef SMP args.common = NULL; - args.nthreads = num_cpu_avail(4); +#ifndef DOUBLE + if (args.m*args.n < 40000) +#else + if (args.m*args.n < 10000) +#endif + args.nthreads=1; + else + args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif diff --git a/interface/lapack/laswp.c b/interface/lapack/laswp.c index 0dde33ae3..6544dbc5b 100644 --- a/interface/lapack/laswp.c +++ b/interface/lapack/laswp.c @@ -97,7 +97,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint * blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, - (int(*)())laswp[flag], nthreads); + (int(*)(void))laswp[flag], nthreads); } #endif diff --git a/interface/lapack/zlaswp.c b/interface/lapack/zlaswp.c index b77a40985..7bb4a659e 100644 --- a/interface/lapack/zlaswp.c +++ b/interface/lapack/zlaswp.c @@ -96,7 +96,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint * mode = BLAS_SINGLE | BLAS_COMPLEX; #endif - blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, (int(*)())laswp[flag], nthreads); + blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, (int(*)(void))laswp[flag], nthreads); } #endif diff --git a/interface/max.c b/interface/max.c index f05977448..7817601b9 100644 --- a/interface/max.c +++ b/interface/max.c @@ -46,6 +46,12 @@ #ifdef USE_ABS +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + #ifndef USE_MIN /* ABS & MAX */ @@ -92,6 +98,8 @@ #else +#define ABS + #ifndef USE_MIN /* MAX */ @@ -130,6 +138,12 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ if (n <= 0) return 0; +#ifndef COMPLEX + if (incx == 0) return (ABS(*x)); +#else + if (incx == 0) return (ABS(*x) + ABS(*(x+1))); +#endif + IDEBUG_START; FUNCTION_PROFILE_START(); @@ -145,14 +159,25 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ #else +#ifdef COMPLEX +FLOAT CNAME(blasint n, void *vx, blasint incx){ + FLOAT *x = (FLOAT*) vx; +#else FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ - +#endif + FLOAT ret; PRINT_DEBUG_CNAME; if (n <= 0) return 0; +#ifndef COMPLEX + if (incx == 0) return (ABS(*x)); +#else + if (incx == 0) return (ABS(*x) + ABS(*(x+1))); +#endif + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/nrm2.c b/interface/nrm2.c index dc8c08e9a..331ebc3d0 100644 --- a/interface/nrm2.c +++ b/interface/nrm2.c @@ -54,6 +54,21 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ if (n <= 0) return 0.; +#ifndef COMPLEX + if (n == 1) +#ifdef DOUBLE + return fabs(x[0]); +#else + return fabsf(x[0]); +#endif +#endif + + if (incx < 0) +#ifdef COMPLEX + x -= (n - 1) * incx * 2; +#else + x -= (n - 1) * incx; +#endif IDEBUG_START; FUNCTION_PROFILE_START(); @@ -82,6 +97,22 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ if (n <= 0) return 0.; + #ifndef COMPLEX + if (n == 1) +#ifdef DOUBLE + return fabs(x[0]); +#else + return fabsf(x[0]); +#endif +#endif + + if (incx < 0) +#ifdef COMPLEX + x -= (n - 1) * incx * 2; +#else + x -= (n - 1) * incx; +#endif + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/omatcopy.c b/interface/omatcopy.c index 59650cfa0..c26446f5c 100644 --- a/interface/omatcopy.c +++ b/interface/omatcopy.c @@ -90,27 +90,29 @@ void CNAME(enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, #endif if ( order == BlasColMajor) { - if ( trans == BlasNoTrans && *ldb < *rows ) info = 9; - if ( trans == BlasTrans && *ldb < *cols ) info = 9; + if ( trans == BlasNoTrans && *ldb < MAX(1,*rows) ) info = 9; + if ( trans == BlasTrans && *ldb < MAX(1,*cols) ) info = 9; } if ( order == BlasRowMajor) { - if ( trans == BlasNoTrans && *ldb < *cols ) info = 9; - if ( trans == BlasTrans && *ldb < *rows ) info = 9; + if ( trans == BlasNoTrans && *ldb < MAX(1,*cols) ) info = 9; + if ( trans == BlasTrans && *ldb < MAX(1,*rows) ) info = 9; } - if ( order == BlasColMajor && *lda < *rows ) info = 7; - if ( order == BlasRowMajor && *lda < *cols ) info = 7; - if ( *cols <= 0 ) info = 4; - if ( *rows <= 0 ) info = 3; - if ( trans < 0 ) info = 2; - if ( order < 0 ) info = 1; + if ( order == BlasColMajor && *lda < MAX(1,*rows) ) info = 7; + if ( order == BlasRowMajor && *lda < MAX(1,*cols) ) info = 7; + if ( *cols < 0 ) info = 4; + if ( *rows < 0 ) info = 3; + if ( trans < 0 ) info = 2; + if ( order < 0 ) info = 1; if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } + if ((*rows == 0) || (*cols == 0)) return; + if ( order == BlasColMajor ) { if ( trans == BlasNoTrans ) diff --git a/interface/rotg.c b/interface/rotg.c index 69443a5a0..423ebda21 100644 --- a/interface/rotg.c +++ b/interface/rotg.c @@ -1,9 +1,11 @@ #include +#include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif + #ifndef CBLAS void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ @@ -14,17 +16,27 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ #endif +#ifdef DOUBLE + long double safmin = DBL_MIN; +#else + long double safmin = FLT_MIN; +#endif + #if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86) long double da = *DA; long double db = *DB; long double c; long double s; - long double r, roe, z; + long double r, z; + long double sigma, dascal,dbscal; long double ada = fabsl(da); long double adb = fabsl(db); - long double scale = ada + adb; + long double maxab = MAX(ada,adb); + long double safmax; + long double scale; + #ifndef CBLAS PRINT_DEBUG_NAME; @@ -32,27 +44,30 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ PRINT_DEBUG_CNAME; #endif - roe = db; - if (ada > adb) roe = da; - - if (scale == ZERO) { + if (adb == ZERO) { *C = ONE; *S = ZERO; - *DA = ZERO; *DB = ZERO; + } else if (ada == ZERO) { + *C = ZERO; + *S = ONE; + *DA = *DB; + *DB = ONE; } else { - r = sqrt(da * da + db * db); - if (roe < 0) r = -r; + safmax = 1./safmin; + scale = MIN(MAX(safmin,maxab), safmax); + if (ada > adb) + sigma = copysign(1.,da); + else + sigma = copysign(1.,db); + dascal = da / scale; + dbscal = db / scale; + r = sigma * (scale * sqrt(dascal * dascal + dbscal * dbscal)); c = da / r; s = db / r; z = ONE; - if (da != ZERO) { - if (ada > adb){ - z = s; - } else { - z = ONE / c; - } - } + if (ada > adb) z = s; + if ((ada <= adb) && (c != ZERO)) z = ONE / c; *C = c; *S = s; @@ -65,11 +80,22 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ FLOAT db = *DB; FLOAT c = *C; FLOAT s = *S; - FLOAT r, roe, z; + FLOAT sigma; + FLOAT r, z; FLOAT ada = fabs(da); FLOAT adb = fabs(db); - FLOAT scale = ada + adb; + FLOAT maxab = MAX(ada,adb); + long double safmax ; + FLOAT scale ; + + safmax = 1./safmin; + scale = MIN(MAX(safmin,maxab), safmax); + + if (ada > adb) + sigma = copysign(1.,da); + else + sigma = copysign(1.,db); #ifndef CBLAS PRINT_DEBUG_NAME; @@ -77,20 +103,21 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ PRINT_DEBUG_CNAME; #endif - roe = db; - if (ada > adb) roe = da; - if (scale == ZERO) { + if (adb == ZERO) { *C = ONE; *S = ZERO; - *DA = ZERO; *DB = ZERO; + } else if (ada == ZERO) { + *C = ZERO; + *S = ONE; + *DA = *DB; + *DB = ONE; } else { FLOAT aa = da / scale; FLOAT bb = db / scale; - r = scale * sqrt(aa * aa + bb * bb); - if (roe < 0) r = -r; + r = sigma * scale * sqrt(aa * aa + bb * bb); c = da / r; s = db / r; z = ONE; diff --git a/interface/rotmg.c b/interface/rotmg.c index 3a5ca8f95..b8f627221 100644 --- a/interface/rotmg.c +++ b/interface/rotmg.c @@ -96,12 +96,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ else { dp2 = *dd2 * dy1; - if(dp2 == ZERO) - { - dflag = -TWO; - dparam[0] = dflag; - return; - } dp1 = *dd1 * *dx1; dq2 = dp2 * dy1; dq1 = dp1 * *dx1; @@ -113,24 +107,10 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ dh12 = dp2 / dp1; du = ONE - dh12 * dh21; - if(du > ZERO) - { - dflag = ZERO; - *dd1 = *dd1 / du; - *dd2 = *dd2 / du; - *dx1 = *dx1 * du; - } else { - dflag = -ONE; - - dh11 = ZERO; - dh12 = ZERO; - dh21 = ZERO; - dh22 = ZERO; - - *dd1 = ZERO; - *dd2 = ZERO; - *dx1 = ZERO; - } + dflag = ZERO; + *dd1 = *dd1 / du; + *dd2 = *dd2 / du; + *dx1 = *dx1 * du; } else diff --git a/interface/sbgemmt.c b/interface/sbgemmt.c new file mode 100644 index 000000000..759af4bfb --- /dev/null +++ b/interface/sbgemmt.c @@ -0,0 +1,447 @@ +/*********************************************************************/ +/* Copyright 2024, The OpenBLAS Project. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#define SMP_THRESHOLD_MIN 65536.0 +#define ERROR_NAME "SBGEMMT " + +#ifndef GEMM_MULTITHREAD_THRESHOLD +#define GEMM_MULTITHREAD_THRESHOLD 4 +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANSA, char *TRANSB, + blasint * M, blasint * K, + FLOAT * Alpha, + IFLOAT * a, blasint * ldA, + IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) +{ + + blasint m, k; + blasint lda, ldb, ldc; + int transa, transb, uplo; + blasint info; + + char transA, transB, Uplo; + blasint nrowa, nrowb; + IFLOAT *buffer; + IFLOAT *aa, *bb; + FLOAT *cc; + FLOAT alpha, beta; + + PRINT_DEBUG_NAME; + + m = *M; + k = *K; + + alpha = *Alpha; + beta = *Beta; + + lda = *ldA; + ldb = *ldB; + ldc = *ldC; + + transA = *TRANSA; + transB = *TRANSB; + Uplo = *UPLO; + TOUPPER(transA); + TOUPPER(transB); + TOUPPER(Uplo); + + transa = -1; + transb = -1; + uplo = -1; + + if (transA == 'N') + transa = 0; + if (transA == 'T') + transa = 1; + + if (transA == 'R') + transa = 0; + if (transA == 'C') + transa = 1; + + if (transB == 'N') + transb = 0; + if (transB == 'T') + transb = 1; + + if (transB == 'R') + transb = 0; + if (transB == 'C') + transb = 1; + + if (Uplo == 'U') + uplo = 0; + if (Uplo == 'L') + uplo = 1; + nrowa = m; + if (transa & 1) nrowa = k; + nrowb = k; + if (transb & 1) nrowb = m; + + info = 0; + + if (ldc < MAX(1, m)) + info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; + if (k < 0) + info = 5; + if (m < 0) + info = 4; + if (transb < 0) + info = 3; + if (transa < 0) + info = 2; + if (uplo < 0) + info = 1; + + if (info != 0) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m, + blasint k, + FLOAT alpha, + IFLOAT * A, blasint LDA, + IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc) +{ + IFLOAT *aa, *bb; + FLOAT *cc; + + int transa, transb, uplo; + blasint info; + blasint lda, ldb; + IFLOAT *a, *b; + XFLOAT *buffer; + + PRINT_DEBUG_CNAME; + + uplo = -1; + transa = -1; + transb = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) + transa = 0; + if (TransA == CblasTrans) + transa = 1; + + if (TransA == CblasConjNoTrans) + transa = 0; + if (TransA == CblasConjTrans) + transa = 1; + + if (TransB == CblasNoTrans) + transb = 0; + if (TransB == CblasTrans) + transb = 1; + + if (TransB == CblasConjNoTrans) + transb = 0; + if (TransB == CblasConjTrans) + transb = 1; + + a = (void *)A; + b = (void *)B; + lda = LDA; + ldb = LDB; + + info = -1; + + blasint nrowa; + blasint nrowb; + nrowa = m; + if (transa & 1) nrowa = k; + nrowb = k; + if (transb & 1) nrowb = m; + + if (ldc < MAX(1, m)) + info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; + if (k < 0) + info = 5; + if (m < 0) + info = 4; + if (transb < 0) + info = 3; + if (transa < 0) + info = 2; + if (uplo < 0) + info = 1; + } + + if (order == CblasRowMajor) { + + a = (void *)B; + b = (void *)A; + + lda = LDB; + ldb = LDA; + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransB == CblasNoTrans) + transa = 0; + if (TransB == CblasTrans) + transa = 1; + + if (TransB == CblasConjNoTrans) + transa = 0; + if (TransB == CblasConjTrans) + transa = 1; + + if (TransA == CblasNoTrans) + transb = 0; + if (TransA == CblasTrans) + transb = 1; + + if (TransA == CblasConjNoTrans) + transb = 0; + if (TransA == CblasConjTrans) + transb = 1; + + info = -1; + + blasint ncola; + blasint ncolb; + + ncola = m; + if (transa & 1) ncola = k; + ncolb = k; + + if (transb & 1) { + ncolb = m; + } + + if (ldc < MAX(1,m)) + info = 13; + if (ldb < MAX(1, ncolb)) + info = 8; + if (lda < MAX(1, ncola)) + info = 10; + if (k < 0) + info = 5; + if (m < 0) + info = 4; + if (transb < 0) + info = 2; + if (transa < 0) + info = 3; + if (uplo < 0) + info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + int buffer_size; + blasint i, j; + +#ifdef SMP + int nthreads; +#endif + + +#ifdef SMP + static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, + BLASLONG, IFLOAT *, BLASLONG, FLOAT, + FLOAT *, BLASLONG, int) = { + sbgemv_thread_n, sbgemv_thread_t, + }; +#endif + int (*gemv[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, BLASLONG, + IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = { + SBGEMV_N, SBGEMV_T,}; + + + if (m == 0) + return; + + IDEBUG_START; + + const blasint incb = ((transb & 1) == 0) ? 1 : ldb; + + if (uplo == 1) { + for (i = 0; i < m; i++) { + j = m - i; + + aa = a + i; + bb = b + i * ldb; + if (transa & 1) { + aa = a + lda * i; + } + if (transb & 1) + bb = b + i; + cc = c + i * ldc + i; + +#if 0 + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); + + if (alpha == ZERO) + continue; +#endif + + IDEBUG_START; + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, IFLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + if (!(transa & 1)) + (gemv[(int)transa]) (j, k, alpha, aa, lda, + bb, incb, beta, cc, 1); + else + (gemv[(int)transa]) (k, j, alpha, aa, lda, + bb, incb, beta, cc, 1); + +#ifdef SMP + } else { + if (!(transa & 1)) + (gemv_thread[(int)transa]) (j, k, alpha, aa, + lda, bb, incb, beta, cc, + 1, nthreads); + else + (gemv_thread[(int)transa]) (k, j, alpha, aa, + lda, bb, incb, beta, cc, + 1, nthreads); + + } +#endif + + STACK_FREE(buffer); + } + } else { + + for (i = 0; i < m; i++) { + j = i + 1; + + bb = b + i * ldb; + if (transb & 1) { + bb = b + i; + } + cc = c + i * ldc; + +#if 0 + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); + + if (alpha == ZERO) + continue; +#endif + IDEBUG_START; + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, IFLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + if (!(transa & 1)) + (gemv[(int)transa]) (j, k, alpha, a, lda, bb, + incb, beta, cc, 1); + else + (gemv[(int)transa]) (k, j, alpha, a, lda, bb, + incb, beta, cc, 1); + +#ifdef SMP + } else { + if (!(transa & 1)) + (gemv_thread[(int)transa]) (j, k, alpha, a, lda, + bb, incb, beta, cc, 1, + nthreads); + else + (gemv_thread[(int)transa]) (k, j, alpha, a, lda, + bb, incb, beta, cc, 1, + nthreads); + } +#endif + + STACK_FREE(buffer); + } + } + + IDEBUG_END; + + return; +} diff --git a/interface/symm.c b/interface/symm.c index 3e65e69b1..3e6e0fd48 100644 --- a/interface/symm.c +++ b/interface/symm.c @@ -166,7 +166,7 @@ void NAME(char *SIDE, char *UPLO, int nodes; #endif # if defined(SMP) - int MN; + double MN; #endif blasint info; int side; @@ -264,7 +264,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, int nodes; #endif #if defined(SMP) - int MN; + double MN; #endif PRINT_DEBUG_CNAME; diff --git a/interface/syrk.c b/interface/syrk.c index 3b056aec8..69f2328a4 100644 --- a/interface/syrk.c +++ b/interface/syrk.c @@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS, FLOAT *sa, *sb; #ifdef SMP - int NNK; + double NNK; #ifdef USE_SIMPLE_THREADED_LEVEL3 #ifndef COMPLEX #ifdef XDOUBLE @@ -232,7 +232,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr FLOAT *sa, *sb; #ifdef SMP -int NNK; +double NNK; #ifdef USE_SIMPLE_THREADED_LEVEL3 #ifndef COMPLEX diff --git a/interface/zaxpby.c b/interface/zaxpby.c index 3a4db7403..e5065270d 100644 --- a/interface/zaxpby.c +++ b/interface/zaxpby.c @@ -39,12 +39,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef CBLAS -void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY) +void NAME(blasint *N, void *VALPHA, FLOAT *x, blasint *INCX, void *VBETA, FLOAT *y, blasint *INCY) { blasint n = *N; blasint incx = *INCX; blasint incy = *INCY; + FLOAT* ALPHA = (FLOAT*) VALPHA; + FLOAT* BETA = (FLOAT*) VBETA; #else diff --git a/interface/zgeadd.c b/interface/zgeadd.c index 7124cf230..de71f27b8 100644 --- a/interface/zgeadd.c +++ b/interface/zgeadd.c @@ -66,7 +66,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, info = 0; - if (lda < MAX(1, m)) info = 6; + if (lda < MAX(1, m)) info = 5; if (ldc < MAX(1, m)) info = 8; if (n < 0) info = 2; @@ -115,8 +115,8 @@ void CNAME(enum CBLAS_ORDER order, if (ldc < MAX(1, m)) info = 8; if (lda < MAX(1, m)) info = 5; - if (n < 0) info = 2; - if (m < 0) info = 1; + if (n < 0) info = 1; + if (m < 0) info = 2; } if (info >= 0) { diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index ecda5ef4e..b879c9ac2 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -101,51 +101,59 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, if ( order == BlasColMajor) { - if ( trans == BlasNoTrans && *ldb < *rows ) info = 9; - if ( trans == BlasConj && *ldb < *rows ) info = 9; - if ( trans == BlasTrans && *ldb < *cols ) info = 9; - if ( trans == BlasTransConj && *ldb < *cols ) info = 9; + if ( trans == BlasNoTrans && *ldb < MAX(1,*rows) ) info = 9; + if ( trans == BlasConj && *ldb < MAX(1,*rows) ) info = 9; + if ( trans == BlasTrans && *ldb < MAX(1,*cols) ) info = 9; + if ( trans == BlasTransConj && *ldb < MAX(1,*cols) ) info = 9; } if ( order == BlasRowMajor) { - if ( trans == BlasNoTrans && *ldb < *cols ) info = 9; - if ( trans == BlasConj && *ldb < *cols ) info = 9; - if ( trans == BlasTrans && *ldb < *rows ) info = 9; - if ( trans == BlasTransConj && *ldb < *rows ) info = 9; + if ( trans == BlasNoTrans && *ldb < MAX(1,*cols) ) info = 9; + if ( trans == BlasConj && *ldb < MAX(1,*cols) ) info = 9; + if ( trans == BlasTrans && *ldb < MAX(1,*rows) ) info = 9; + if ( trans == BlasTransConj && *ldb < MAX(1,*rows) ) info = 9; } - if ( order == BlasColMajor && *lda < *rows ) info = 7; - if ( order == BlasRowMajor && *lda < *cols ) info = 7; - if ( *cols <= 0 ) info = 4; - if ( *rows <= 0 ) info = 3; - if ( trans < 0 ) info = 2; - if ( order < 0 ) info = 1; + if ( order == BlasColMajor && *lda < MAX(1,*rows) ) info = 7; + if ( order == BlasRowMajor && *lda < MAX(1,*cols) ) info = 7; + if ( *cols < 0 ) info = 4; + if ( *rows < 0 ) info = 3; + if ( trans < 0 ) info = 2; + if ( order < 0 ) info = 1; if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } + + if ((*rows == 0) || (*cols == 0)) return; + #ifdef NEW_IMATCOPY - if (*lda == *ldb && *cols == *rows) { + if (*lda == *ldb ) { if ( order == BlasColMajor ) { if ( trans == BlasNoTrans ) { IMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } if ( trans == BlasConj ) { IMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } - if ( trans == BlasTrans ) + if ( trans == BlasTrans && *rows == *cols ) { IMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } - if ( trans == BlasTransConj ) + if ( trans == BlasTransConj && *rows == *cols ) { IMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } + } else { @@ -153,67 +161,62 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, if ( trans == BlasNoTrans ) { IMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } if ( trans == BlasConj ) { IMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } - if ( trans == BlasTrans ) + if ( trans == BlasTrans && *rows == *cols ) { IMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } - if ( trans == BlasTransConj ) + if ( trans == BlasTransConj && *rows == *cols ) { IMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } + } - return; } #endif - if ( *lda > *ldb ) - msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT) * 2; + if ( *rows > *cols ) + msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT) * 2; else - msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT) * 2; - - b = malloc(msize); - if ( b == NULL ) - { - printf("Memory alloc failed in zimatcopy\n"); - exit(1); - } + msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT) * 2; + b = malloc(msize); + if ( b == NULL ) + { + printf("Memory alloc failed in zimatcopy\n"); + exit(1); + } if ( order == BlasColMajor ) { if ( trans == BlasNoTrans ) { - OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows ); + OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb ); } - if ( trans == BlasConj ) + else if ( trans == BlasConj ) { - OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows ); + OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb ); } - if ( trans == BlasTrans ) + else if ( trans == BlasTrans ) { - OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols ); + OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb ); } - if ( trans == BlasTransConj ) + else if ( trans == BlasTransConj ) { - OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols ); + OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb ); } } @@ -222,34 +225,27 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, if ( trans == BlasNoTrans ) { - OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols ); + OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb ); } - if ( trans == BlasConj ) + else if ( trans == BlasConj ) { - OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols ); + OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb ); } - if ( trans == BlasTrans ) + else if ( trans == BlasTrans ) { - OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows ); + OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb ); } - if ( trans == BlasTransConj ) + else if ( trans == BlasTransConj ) { - OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows ); + OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb ); } } + free(b); return; diff --git a/interface/zomatcopy.c b/interface/zomatcopy.c index 7345633a2..7121711d8 100644 --- a/interface/zomatcopy.c +++ b/interface/zomatcopy.c @@ -92,31 +92,33 @@ void CNAME(enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, #endif if ( order == BlasColMajor) { - if ( trans == BlasNoTrans && *ldb < *rows ) info = 9; - if ( trans == BlasConj && *ldb < *rows ) info = 9; - if ( trans == BlasTrans && *ldb < *cols ) info = 9; - if ( trans == BlasTransConj && *ldb < *cols ) info = 9; + if ( trans == BlasNoTrans && *ldb < MAX(1,*rows) ) info = 9; + if ( trans == BlasConj && *ldb < MAX(1,*rows) ) info = 9; + if ( trans == BlasTrans && *ldb < MAX(1,*cols) ) info = 9; + if ( trans == BlasTransConj && *ldb < MAX(1,*cols) ) info = 9; } if ( order == BlasRowMajor) { - if ( trans == BlasNoTrans && *ldb < *cols ) info = 9; - if ( trans == BlasConj && *ldb < *cols ) info = 9; - if ( trans == BlasTrans && *ldb < *rows ) info = 9; - if ( trans == BlasTransConj && *ldb < *rows ) info = 9; + if ( trans == BlasNoTrans && *ldb < MAX(1,*cols) ) info = 9; + if ( trans == BlasConj && *ldb < MAX(1,*cols) ) info = 9; + if ( trans == BlasTrans && *ldb < MAX(1,*rows) ) info = 9; + if ( trans == BlasTransConj && *ldb < MAX(1,*rows) ) info = 9; } - if ( order == BlasColMajor && *lda < *rows ) info = 7; - if ( order == BlasRowMajor && *lda < *cols ) info = 7; - if ( *cols <= 0 ) info = 4; - if ( *rows <= 0 ) info = 3; - if ( trans < 0 ) info = 2; - if ( order < 0 ) info = 1; + if ( order == BlasColMajor && *lda < MAX(1,*rows) ) info = 7; + if ( order == BlasRowMajor && *lda < MAX(1,*cols) ) info = 7; + if ( *cols < 0 ) info = 4; + if ( *rows < 0 ) info = 3; + if ( trans < 0 ) info = 2; + if ( order < 0 ) info = 1; if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } + if ((*rows == 0) || (*cols == 0)) return; + if ( order == BlasColMajor ) { diff --git a/interface/zrotg.c b/interface/zrotg.c index 123f4da85..8acc3c9b2 100644 --- a/interface/zrotg.c +++ b/interface/zrotg.c @@ -1,9 +1,11 @@ #include +#include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif + #ifndef CBLAS void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ @@ -14,53 +16,26 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { FLOAT *S = (FLOAT*) VS; #endif /* CBLAS */ -#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86) - - long double da_r = *(DA + 0); - long double da_i = *(DA + 1); - long double db_r = *(DB + 0); - long double db_i = *(DB + 1); - long double r; - - long double ada = fabsl(da_r) + fabsl(da_i); - - PRINT_DEBUG_NAME; - - IDEBUG_START; - - FUNCTION_PROFILE_START(); - - if (ada == ZERO) { - *C = ZERO; - *(S + 0) = ONE; - *(S + 1) = ZERO; - *(DA + 0) = db_r; - *(DA + 1) = db_i; - } else { - long double alpha_r, alpha_i; - - ada = sqrt(da_r * da_r + da_i * da_i); - - r = sqrt(da_r * da_r + da_i * da_i + db_r * db_r + db_i * db_i); +#ifdef DOUBLE + long double safmin = DBL_MIN; + long double rtmin = sqrt(DBL_MIN/DBL_EPSILON); +#else + long double safmin = FLT_MIN; + long double rtmin = sqrt(FLT_MIN/FLT_EPSILON); +#endif - alpha_r = da_r / ada; - alpha_i = da_i / ada; - *(C + 0) = ada / r; - *(S + 0) = (alpha_r * db_r + alpha_i *db_i) / r; - *(S + 1) = (alpha_i * db_r - alpha_r *db_i) / r; - *(DA + 0) = alpha_r * r; - *(DA + 1) = alpha_i * r; - } -#else - FLOAT da_r = *(DA + 0); - FLOAT da_i = *(DA + 1); - FLOAT db_r = *(DB + 0); - FLOAT db_i = *(DB + 1); - FLOAT r; + FLOAT da_r = *(DA+0); + FLOAT da_i = *(DA+1); + FLOAT db_r = *(DB+0); + FLOAT db_i = *(DB+1); + //long double r; + FLOAT S1[2]; + FLOAT R[2]; + long double d; - FLOAT ada = fabs(da_r) + fabs(da_i); - FLOAT adb; + FLOAT ada = da_r * da_r + da_i * da_i; + FLOAT adb = db_r * db_r + db_i * db_i; PRINT_DEBUG_NAME; @@ -68,69 +43,140 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { FUNCTION_PROFILE_START(); - if (ada == ZERO) { - *C = ZERO; - *(S + 0) = ONE; + if (db_r == ZERO && db_i == ZERO) { + *C = ONE; + *(S + 0) = ZERO; *(S + 1) = ZERO; - *(DA + 0) = db_r; - *(DA + 1) = db_i; - } else { - FLOAT scale; - FLOAT aa_r, aa_i, bb_r, bb_i; - FLOAT alpha_r, alpha_i; - - aa_r = fabs(da_r); - aa_i = fabs(da_i); - - if (aa_i > aa_r) { - aa_r = fabs(da_i); - aa_i = fabs(da_r); - } - - if (aa_r == ZERO) { - ada = 0.; - } else { - scale = (aa_i / aa_r); - ada = aa_r * sqrt(ONE + scale * scale); - } - - bb_r = fabs(db_r); - bb_i = fabs(db_i); - - if (bb_i > bb_r) { - bb_r = fabs(bb_i); - bb_i = fabs(bb_r); - } - - if (bb_r == ZERO) { - adb = 0.; - } else { - scale = (bb_i / bb_r); - adb = bb_r * sqrt(ONE + scale * scale); - } - scale = ada + adb; - - aa_r = da_r / scale; - aa_i = da_i / scale; - bb_r = db_r / scale; - bb_i = db_i / scale; - - r = scale * sqrt(aa_r * aa_r + aa_i * aa_i + bb_r * bb_r + bb_i * bb_i); - - alpha_r = da_r / ada; - alpha_i = da_i / ada; - - *(C + 0) = ada / r; - *(S + 0) = (alpha_r * db_r + alpha_i *db_i) / r; - *(S + 1) = (alpha_i * db_r - alpha_r *db_i) / r; - *(DA + 0) = alpha_r * r; - *(DA + 1) = alpha_i * r; + return; } -#endif - FUNCTION_PROFILE_END(4, 4, 4); - - IDEBUG_END; - - return; + long double safmax = 1./safmin; +#if defined DOUBLE + long double rtmax = safmax /DBL_EPSILON; +#else + long double rtmax = safmax /FLT_EPSILON; +#endif + *(S1 + 0) = *(DB + 0); + *(S1 + 1) = *(DB + 1) *-1; + if (da_r == ZERO && da_i == ZERO) { + *C = ZERO; + if (db_r == ZERO) { + (*DA) = fabsl(db_i); + *S = *S1 /(*DA); + *(S+1) = *(S1+1) /(*DA); + return; + } else if ( db_i == ZERO) { + *DA = fabsl(db_r); + *S = *S1 /(*DA); + *(S+1) = *(S1+1) /(*DA); + return; + } else { + long double g1 = MAX( fabsl(db_r), fabsl(db_i)); + rtmax =sqrt(safmax/2.); + if (g1 > rtmin && g1 < rtmax) { // unscaled + d = sqrt(adb); + *S = *S1 /d; + *(S+1) = *(S1+1) /d; + *DA = d ; + *(DA+1) = ZERO; + return; + } else { // scaled algorithm + long double u = MIN ( safmax, MAX ( safmin, g1)); + FLOAT gs_r = db_r/u; + FLOAT gs_i = db_i/u; + d = sqrt ( gs_r*gs_r + gs_i*gs_i); + *S = gs_r / d; + *(S + 1) = (gs_i * -1) / d; + *DA = d * u; + *(DA+1) = ZERO; + return; + } + } + } else { + FLOAT f1 = MAX ( fabsl(da_r), fabsl(da_i)); + FLOAT g1 = MAX ( fabsl(db_r), fabsl(db_i)); + rtmax = sqrt(safmax / 4.); + if ( f1 > rtmin && f1 < rtmax && g1 > rtmin && g1 < rtmax) { //unscaled + long double h = ada + adb; + double adahsq = sqrt(ada * h); + if (ada >= h *safmin) { + *C = sqrt(ada/h); + *R = *DA / *C; + *(R+1) = *(DA+1) / *C; + rtmax *= 2.; + if ( ada > rtmin && h < rtmax) { // no risk of intermediate overflow + *S = *S1 * (*DA / adahsq) - *(S1+1)* (*(DA+1)/adahsq); + *(S+1) = *S1 * (*(DA+1) / adahsq) + *(S1+1) * (*DA/adahsq); + } else { + *S = *S1 * (*R/h) - *(S1+1) * (*(R+1)/h); + *(S+1) = *S1 * (*(R+1)/h) + *(S1+1) * (*(R)/h); + } + } else { + *C = ada / adahsq; + if (*C >= safmin) { + *R = *DA / *C; + *(R+1) = *(DA+1) / *C; + } else { + *R = *DA * (h / adahsq); + *(R+1) = *(DA+1) * (h / adahsq); + } + *S = *S1 * ada / adahsq; + *(S+1) = *(S1+1) * ada / adahsq; + } + *DA=*R; + *(DA+1)=*(R+1); + return; + } else { // scaled + FLOAT fs_r, fs_i, gs_r, gs_i; + long double v,w,f2,g2,h; + long double u = MIN ( safmax, MAX ( safmin, MAX(f1,g1))); + gs_r = db_r/u; + gs_i = db_i/u; + g2 = sqrt ( gs_r*gs_r + gs_i*gs_i); + if (f1 /u < rtmin) { + v = MIN (safmax, MAX (safmin, f1)); + w = v / u; + fs_r = *DA/ v; + fs_i = *(DA+1) / v; + f2 = sqrt ( fs_r*fs_r + fs_i*fs_i); + h = f2 * w * w + g2; + } else { // use same scaling for both + w = 1.; + fs_r = *DA/ u; + fs_i = *(DA+1) / u; + f2 = sqrt ( fs_r*fs_r + fs_i*fs_i); + h = f2 + g2; + } + if ( f2 >= h * safmin) { + *C = sqrt ( f2 / h ); + *DA = fs_r / *C; + *(DA+1) = fs_i / *C; + rtmax *= 2; + if ( f2 > rtmin && h < rtmax) { + *S = gs_r * (fs_r /sqrt(f2*h)) - gs_i * (fs_i / sqrt(f2*h)); + *(S+1) = gs_r * (fs_i /sqrt(f2*h)) + gs_i * -1. * (fs_r / sqrt(f2*h)); + } else { + *S = gs_r * (*DA/h) - gs_i * (*(DA+1) / h); + *(S+1) = gs_r * (*(DA+1) /h) + gs_i * -1. * (*DA / h); + } + } else { // intermediates might overflow + d = sqrt ( f2 * h); + *C = f2 /d; + if (*C >= safmin) { + *DA = fs_r / *C; + *(DA+1) = fs_i / *C; + } else { + *DA = fs_r * (h / d); + *(DA+1) = fs_i / (h / d); + } + *S = gs_r * (fs_r /d) - gs_i * (fs_i / d); + *(S+1) = gs_r * (fs_i /d) + gs_i * -1. * (fs_r / d); + } + *C *= w; + *DA *= u; + *(DA+1) *= u; + return; + } + } } + diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 60314eedb..74e6760c2 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -1349,6 +1349,9 @@ endif () set_target_properties(kernel${TSUFFIX} PROPERTIES COMPILE_FLAGS "${KERNEL_DEFINITIONS}") get_target_property(KERNEL_INCLUDE_DIRECTORIES kernel${TSUFFIX} INCLUDE_DIRECTORIES) set_target_properties(kernel${TSUFFIX} PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDE_DIRECTORIES};${TARGET_CONF_DIR}") + if (USE_GEMM3M) + target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M) + endif() endfunction () diff --git a/kernel/Makefile b/kernel/Makefile index 977886044..3f9afd3fa 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,12 +5,6 @@ endif TOPDIR = .. include $(TOPDIR)/Makefile.system -ifeq ($(ARCH), power) -ifeq ($(C_COMPILER), CLANG) - override CFLAGS += -fno-integrated-as -endif -endif - AVX2OPT = ifeq ($(C_COMPILER), GCC) # AVX2 support was added in 4.7.0 @@ -33,7 +27,7 @@ endif ifdef TARGET_CORE ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) - ifeq ($(GCCVERSIONGTEQ10), 1) + ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12))) override CFLAGS += -march=sapphirerapids else override CFLAGS += -march=skylake-avx512 -mavx512f @@ -48,7 +42,7 @@ ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) endif else ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) - ifeq ($(GCCVERSIONGTEQ10), 1) + ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(CLANGVERSIONGTEQ9))) override CFLAGS += -march=cooperlake else override CFLAGS += -march=skylake-avx512 -mavx512f @@ -77,6 +71,12 @@ else ifeq ($(TARGET_CORE), ZEN) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) else ifeq ($(TARGET_CORE), LOONGSON3R4) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) +else ifneq ($(filter NEOVERSEN2 NEOVERSEV1, $(TARGET_CORE)),) + ifeq ($(C_COMPILER), PGI) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -Msve_intrinsics + else + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) + endif else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index bea6cb048..863f376e9 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -35,6 +35,12 @@ USE_TRMM = 1 endif endif +ifneq ($(DYNAMIC_ARCH), 1) +ifeq ($(TARGET), MIPS64_GENERIC) +USE_TRMM = 1 +endif +endif + ifeq ($(CORE), HASWELL) USE_TRMM = 1 endif @@ -55,6 +61,15 @@ ifeq ($(CORE), ZEN) USE_TRMM = 1 endif +ifeq ($(OS), AIX) +M4VERSION := $(shell m4 --version < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?) +ifeq ($(M4VERSION), 0) +M4_AIX := m4 -l16384 +else +M4_AIX := m4 -B16384 +endif +$(info $$var is [${$(M4_AIX)}]) +endif ifeq ($(CORE), POWER8) ifeq ($(BINARY64),1) USE_TRMM = 1 @@ -167,7 +182,7 @@ ifeq ($(BUILD_BFLOAT16),1) SBBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX) endif -ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" +ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" SBLASOBJS += \ sgemm_beta$(TSUFFIX).$(SUFFIX) \ strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ @@ -176,7 +191,7 @@ SBLASOBJS += \ strsm_kernel_RN$(TSUFFIX).$(SUFFIX) strsm_kernel_RT$(TSUFFIX).$(SUFFIX) endif -ifeq ($(BUILD_DOUBLE),1) +ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" DBLASOBJS += \ dgemm_beta$(TSUFFIX).$(SUFFIX) \ dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ @@ -192,7 +207,7 @@ QBLASOBJS += \ qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) -ifeq ($(BUILD_COMPLEX),1) +ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" CBLASOBJS += \ ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ @@ -240,7 +255,7 @@ XBLASOBJS += xgemm3m_kernel$(TSUFFIX).$(SUFFIX) endif -ifeq ($(BUILD_SINGLE),1) +ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" SBLASOBJS += \ strmm_iunucopy$(TSUFFIX).$(SUFFIX) strmm_iunncopy$(TSUFFIX).$(SUFFIX) \ strmm_ilnucopy$(TSUFFIX).$(SUFFIX) strmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ @@ -249,10 +264,7 @@ SBLASOBJS += \ strmm_ounucopy$(TSUFFIX).$(SUFFIX) strmm_ounncopy$(TSUFFIX).$(SUFFIX) \ strmm_olnucopy$(TSUFFIX).$(SUFFIX) strmm_olnncopy$(TSUFFIX).$(SUFFIX) \ strmm_outucopy$(TSUFFIX).$(SUFFIX) strmm_outncopy$(TSUFFIX).$(SUFFIX) \ - strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX) -endif -ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" -SBLASOBJS += \ + strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX) \ strsm_iunucopy$(TSUFFIX).$(SUFFIX) strsm_iunncopy$(TSUFFIX).$(SUFFIX) \ strsm_ilnucopy$(TSUFFIX).$(SUFFIX) strsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ strsm_iutucopy$(TSUFFIX).$(SUFFIX) strsm_iutncopy$(TSUFFIX).$(SUFFIX) \ @@ -260,10 +272,7 @@ SBLASOBJS += \ strsm_ounucopy$(TSUFFIX).$(SUFFIX) strsm_ounncopy$(TSUFFIX).$(SUFFIX) \ strsm_olnucopy$(TSUFFIX).$(SUFFIX) strsm_olnncopy$(TSUFFIX).$(SUFFIX) \ strsm_outucopy$(TSUFFIX).$(SUFFIX) strsm_outncopy$(TSUFFIX).$(SUFFIX) \ - strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX) -endif -ifeq ($(BUILD_SINGLE),1) -SBLASOBJS += \ + strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX) \ ssymm_iutcopy$(TSUFFIX).$(SUFFIX) ssymm_iltcopy$(TSUFFIX).$(SUFFIX) \ ssymm_outcopy$(TSUFFIX).$(SUFFIX) ssymm_oltcopy$(TSUFFIX).$(SUFFIX) endif @@ -385,7 +394,7 @@ XBLASOBJS += \ ifeq ($(USE_GEMM3M), 1) -ifeq ($(BUILD_COMPLEX),1) +ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" CBLASOBJS += \ cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ @@ -628,15 +637,7 @@ $(KDIR)$(SBGEMMONCOPYOBJ) : $(KERNELDIR)/$(SBGEMMONCOPY) $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SBGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SBGEMMOTCOPY) - -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmotcopy.s - m4 sbgemmotcopy.s > sbgemmotcopy_nomacros.s - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmotcopy_nomacros.s -o $@ - rm sbgemmotcopy.s sbgemmotcopy_nomacros.s -else $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -endif ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) @@ -644,14 +645,7 @@ $(KDIR)$(SBGEMMINCOPYOBJ) : $(KERNELDIR)/$(SBGEMMINCOPY) $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SBGEMMITCOPYOBJ) : $(KERNELDIR)/$(SBGEMMITCOPY) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmitcopy.s - m4 sbgemmitcopy.s > sbgemmitcopy_nomacros.s - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmitcopy_nomacros.s -o $@ - rm sbgemmitcopy.s sbgemmitcopy_nomacros.s -else $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -endif endif endif @@ -662,7 +656,7 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s - m4 sgemmotcopy.s > sgemmotcopy_nomacros.s + $(M4_AIX) sgemmotcopy.s > sgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ rm sgemmotcopy.s sgemmotcopy_nomacros.s else @@ -678,7 +672,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s - m4 sgemmitcopy.s > sgemmitcopy_nomacros.s + $(M4_AIX) sgemmitcopy.s > sgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ rm sgemmitcopy.s sgemmitcopy_nomacros.s else @@ -690,7 +684,7 @@ endif $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_ncopy.s - m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s + $(M4_AIX) dgemm_ncopy.s > dgemm_ncopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@ rm dgemm_ncopy.s dgemm_ncopy_nomacros.s else @@ -708,7 +702,7 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_itcopy.s - m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s + $(M4_AIX) dgemm_itcopy.s > dgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@ rm dgemm_itcopy.s dgemm_itcopy_nomacros.s else @@ -751,7 +745,7 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -S $< -o - > cgemm_itcopy.s - m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s + $(M4_AIX) cgemm_itcopy.s > cgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@ rm cgemm_itcopy.s cgemm_itcopy_nomacros.s else @@ -774,7 +768,7 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > zgemm_itcopy.s - m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s + $(M4_AIX) zgemm_itcopy.s > zgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@ rm zgemm_itcopy.s zgemm_itcopy_nomacros.s else @@ -806,7 +800,7 @@ endif $(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemm_kernel$(TSUFFIX).s - m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s + $(M4_AIX) sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@ rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s else @@ -823,20 +817,13 @@ endif ifeq ($(BUILD_BFLOAT16), 1) $(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemm_kernel$(TSUFFIX).s - m4 sbgemm_kernel$(TSUFFIX).s > sbgemm_kernel$(TSUFFIX)_nomacros.s - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemm_kernel$(TSUFFIX)_nomacros.s -o $@ - rm sbgemm_kernel$(TSUFFIX).s sbgemm_kernel$(TSUFFIX)_nomacros.s -else $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif -endif $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s - m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s + $(M4_AIX) dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@ rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s else @@ -849,7 +836,7 @@ $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEP $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNN $< -o - > cgemm_kernel_n.s - m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s + $(M4_AIX) cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@ rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s else @@ -859,7 +846,7 @@ endif $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCN $< -o - > cgemm_kernel_l.s - m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s + $(M4_AIX) cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@ rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s else @@ -869,7 +856,7 @@ endif $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s - m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s + $(M4_AIX) cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s else @@ -879,7 +866,7 @@ endif $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCC $< -o - > cgemm_kernel_b.s - m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s + $(M4_AIX) cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@ rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s else @@ -889,7 +876,7 @@ endif $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zgemm_kernel_n.s - m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s + $(M4_AIX) zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s else ifeq ($(CORE),SANDYBRIDGE) @@ -901,7 +888,7 @@ endif $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCN $< -o - > zgemm_kernel_l.s - m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s + $(M4_AIX) zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s else ifeq ($(CORE),SANDYBRIDGE) @@ -913,7 +900,7 @@ endif $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNC $< -o - > zgemm_kernel_r.s - m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s + $(M4_AIX) zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s else ifeq ($(CORE),SANDYBRIDGE) @@ -925,7 +912,7 @@ endif $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCC $< -o - > zgemm_kernel_b.s - m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s + $(M4_AIX) zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s else ifeq ($(CORE),SANDYBRIDGE) @@ -951,7 +938,7 @@ ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > strmmkernel_ln.s - m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s + $(M4_AIX) strmmkernel_ln.s > strmmkernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@ rm strmmkernel_ln.s strmmkernel_ln_nomacros.s else @@ -961,7 +948,7 @@ endif $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > strmmkernel_lt.s - m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s + $(M4_AIX) strmmkernel_lt.s > strmmkernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@ rm strmmkernel_lt.s strmmkernel_lt_nomacros.s else @@ -971,7 +958,7 @@ endif $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > strmmkernel_rn.s - m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s + $(M4_AIX) strmmkernel_rn.s > strmmkernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@ rm strmmkernel_rn.s strmmkernel_rn_nomacros.s else @@ -981,7 +968,7 @@ endif $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s - m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s + $(M4_AIX) strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s else @@ -991,7 +978,7 @@ endif $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > dtrmm_kernel_ln.s - m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s + $(M4_AIX) dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@ rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s else @@ -1001,7 +988,7 @@ endif $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > dtrmm_kernel_lt.s - m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s + $(M4_AIX) dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@ rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s else @@ -1011,7 +998,7 @@ endif $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > dtrmm_kernel_rn.s - m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s + $(M4_AIX) dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@ rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s else @@ -1021,7 +1008,7 @@ endif $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > dtrmm_kernel_rt.s - m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s + $(M4_AIX) dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@ rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s else @@ -1043,7 +1030,7 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_ln.s - m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s + $(M4_AIX) ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@ rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s else @@ -1053,7 +1040,7 @@ endif $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_lt.s - m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s + $(M4_AIX) ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@ rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s else @@ -1063,7 +1050,7 @@ endif $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lr.s - m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s + $(M4_AIX) ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@ rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s else @@ -1073,7 +1060,7 @@ endif $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lc.s - m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s + $(M4_AIX) ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@ rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s else @@ -1083,7 +1070,7 @@ endif $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rn.s - m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s + $(M4_AIX) ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@ rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s else @@ -1093,7 +1080,7 @@ endif $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rt.s - m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s + $(M4_AIX) ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@ rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s else @@ -1103,7 +1090,7 @@ endif $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_rr.s - m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s + $(M4_AIX) ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@ rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s else @@ -1113,7 +1100,7 @@ endif $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_RC.s - m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s + $(M4_AIX) ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@ rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s else @@ -1123,7 +1110,7 @@ endif $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_ln.s - m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s + $(M4_AIX) ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s else ifeq ($(CORE), SANDYBRIDGE) @@ -1135,7 +1122,7 @@ endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_lt.s - m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s + $(M4_AIX) ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s else ifeq ($(CORE), SANDYBRIDGE) @@ -1147,7 +1134,7 @@ endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lr.s - m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s + $(M4_AIX) ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s else ifeq ($(CORE), SANDYBRIDGE) @@ -1159,7 +1146,7 @@ endif $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lc.s - m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s + $(M4_AIX) ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s else ifeq ($(CORE), SANDYBRIDGE) @@ -1171,7 +1158,7 @@ endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rn.s - m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s + $(M4_AIX) ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s else ifeq ($(CORE), SANDYBRIDGE) @@ -1183,7 +1170,7 @@ endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rt.s - m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s + $(M4_AIX) ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s else ifeq ($(CORE), SANDYBRIDGE) @@ -1195,7 +1182,7 @@ endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rr.s - m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s + $(M4_AIX) ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s else ifeq ($(CORE), SANDYBRIDGE) @@ -1207,7 +1194,7 @@ endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rc.s - m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s + $(M4_AIX) ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s else ifeq ($(CORE), SANDYBRIDGE) @@ -1229,7 +1216,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s - m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s + $(M4_AIX) strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s else @@ -1389,7 +1376,7 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o - > dtrsm_kernel_lt.s - m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s + $(M4_AIX) dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s else @@ -2981,7 +2968,7 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM $(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(PFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s - m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s + $(M4_AIX) cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s else @@ -3027,7 +3014,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s - m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s + $(M4_AIX) strmmkernel_rn.s > strmm_kernel_rt_nomacros.s $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s else diff --git a/kernel/arm/nrm2.c b/kernel/arm/nrm2.c index fcff09337..8cc189fe3 100644 --- a/kernel/arm/nrm2.c +++ b/kernel/arm/nrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT absxi = 0.0; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); if ( n == 1 ) return( ABS(x[0]) ); n *= inc_x; diff --git a/kernel/arm/znrm2.c b/kernel/arm/znrm2.c index fc1c8b54a..28bb0eda5 100644 --- a/kernel/arm/znrm2.c +++ b/kernel/arm/znrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG inc_x2; FLOAT temp; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); inc_x2 = 2 * inc_x; diff --git a/kernel/arm/zscal.c b/kernel/arm/zscal.c index 0521aaa0b..b2d537d04 100644 --- a/kernel/arm/zscal.c +++ b/kernel/arm/zscal.c @@ -60,6 +60,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F else { temp = - da_i * x[ip+1] ; + if (isnan(x[ip]) || isinf(x[ip])) temp = NAN; x[ip+1] = da_i * x[ip] ; } } diff --git a/kernel/arm64/KERNEL b/kernel/arm64/KERNEL index f936cdf47..7d7e648c4 100644 --- a/kernel/arm64/KERNEL +++ b/kernel/arm64/KERNEL @@ -1,3 +1,5 @@ +CSUMKERNEL=csum.S + ifndef SNRM2KERNEL SNRM2KERNEL = ../arm/nrm2.c endif diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index bd25f7cd8..bc5999097 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -1,216 +1 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = trsm_kernel_LN_sve.c -STRSMKERNEL_LT = trsm_kernel_LT_sve.c -STRSMKERNEL_RN = trsm_kernel_RN_sve.c -STRSMKERNEL_RT = trsm_kernel_RT_sve.c - -DTRSMKERNEL_LN = trsm_kernel_LN_sve.c -DTRSMKERNEL_LT = trsm_kernel_LT_sve.c -DTRSMKERNEL_RN = trsm_kernel_RN_sve.c -DTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -TRSMCOPYLN_M = trsm_lncopy_sve.c -TRSMCOPYLT_M = trsm_ltcopy_sve.c -TRSMCOPYUN_M = trsm_uncopy_sve.c -TRSMCOPYUT_M = trsm_utcopy_sve.c - -CTRSMKERNEL_LN = trsm_kernel_LN_sve.c -CTRSMKERNEL_LT = trsm_kernel_LT_sve.c -CTRSMKERNEL_RN = trsm_kernel_RN_sve.c -CTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c -ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c -ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c -ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c -ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c -ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c -ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c - - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - - -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -DDOTKERNEL = dot.S -ifneq ($(C_COMPILER), PGI) -SDOTKERNEL = ../generic/dot.c -else -SDOTKERNEL = dot.S -endif -ifneq ($(C_COMPILER), PGI) -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -else -CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c -endif -DSDOTKERNEL = dot.S - -DGEMM_BETA = dgemm_beta.S -SGEMM_BETA = sgemm_beta.S - -SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S - -SGEMMINCOPY = sgemm_ncopy_sve_v1.c -SGEMMITCOPY = sgemm_tcopy_sve_v1.c -SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S -SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S - -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -STRMMUNCOPY_M = trmm_uncopy_sve_v1.c -STRMMLNCOPY_M = trmm_lncopy_sve_v1.c -STRMMUTCOPY_M = trmm_utcopy_sve_v1.c -STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -SSYMMUCOPY_M = symm_ucopy_sve.c -SSYMMLCOPY_M = symm_lcopy_sve.c - -DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S -DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S - -DGEMMINCOPY = dgemm_ncopy_sve_v1.c -DGEMMITCOPY = dgemm_tcopy_sve_v1.c -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c -DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c -DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c -DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -DSYMMUCOPY_M = symm_ucopy_sve.c -DSYMMLCOPY_M = symm_lcopy_sve.c - -CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -CGEMMINCOPY = cgemm_ncopy_sve_v1.c -CGEMMITCOPY = cgemm_tcopy_sve_v1.c -CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c -CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -CHEMMLTCOPY_M = zhemm_ltcopy_sve.c -CHEMMUTCOPY_M = zhemm_utcopy_sve.c - -CSYMMUCOPY_M = zsymm_ucopy_sve.c -CSYMMLCOPY_M = zsymm_lcopy_sve.c - -ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -ZGEMMINCOPY = zgemm_ncopy_sve_v1.c -ZGEMMITCOPY = zgemm_tcopy_sve_v1.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c -ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c -ZHEMMUTCOPY_M = zhemm_utcopy_sve.c - -ZSYMMUCOPY_M = zsymm_ucopy_sve.c -ZSYMMLCOPY_M = zsymm_lcopy_sve.c +include $(KERNELDIR)/KERNEL.ARMV8SVE diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index bd25f7cd8..eeb4844bf 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -1,3 +1,6 @@ + +CSUMKERNEL = csum_thunderx2t99.c +ZSUMKERNEL = zsum_thunderx2t99.c SAMINKERNEL = ../arm/amin.c DAMINKERNEL = ../arm/amin.c CAMINKERNEL = ../arm/zamin.c @@ -57,7 +60,7 @@ CAMAXKERNEL = zamax.S ZAMAXKERNEL = zamax.S SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S CAXPYKERNEL = zaxpy.S ZAXPYKERNEL = zaxpy.S @@ -81,45 +84,35 @@ DGEMVTKERNEL = gemv_t.S CGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S - -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -DDOTKERNEL = dot.S -ifneq ($(C_COMPILER), PGI) -SDOTKERNEL = ../generic/dot.c -else -SDOTKERNEL = dot.S -endif -ifneq ($(C_COMPILER), PGI) -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -else -CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c -endif +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S @@ -128,10 +121,10 @@ SGEMM_BETA = sgemm_beta.S SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S -SGEMMINCOPY = sgemm_ncopy_sve_v1.c -SGEMMITCOPY = sgemm_tcopy_sve_v1.c -SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S -SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S +SGEMMINCOPY = gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c +SGEMMITCOPY = gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) @@ -149,8 +142,8 @@ SSYMMLCOPY_M = symm_lcopy_sve.c DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S -DGEMMINCOPY = dgemm_ncopy_sve_v1.c -DGEMMITCOPY = dgemm_tcopy_sve_v1.c +DGEMMINCOPY = gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c +DGEMMITCOPY = gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S @@ -170,8 +163,8 @@ DSYMMLCOPY_M = symm_lcopy_sve.c CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -CGEMMINCOPY = cgemm_ncopy_sve_v1.c -CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c +CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c @@ -194,8 +187,8 @@ CSYMMLCOPY_M = zsymm_lcopy_sve.c ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -ZGEMMINCOPY = zgemm_ncopy_sve_v1.c -ZGEMMITCOPY = zgemm_tcopy_sve_v1.c +ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c +ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c diff --git a/kernel/arm64/KERNEL.CORTEXA510 b/kernel/arm64/KERNEL.CORTEXA510 index bd25f7cd8..bc5999097 100644 --- a/kernel/arm64/KERNEL.CORTEXA510 +++ b/kernel/arm64/KERNEL.CORTEXA510 @@ -1,216 +1 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = trsm_kernel_LN_sve.c -STRSMKERNEL_LT = trsm_kernel_LT_sve.c -STRSMKERNEL_RN = trsm_kernel_RN_sve.c -STRSMKERNEL_RT = trsm_kernel_RT_sve.c - -DTRSMKERNEL_LN = trsm_kernel_LN_sve.c -DTRSMKERNEL_LT = trsm_kernel_LT_sve.c -DTRSMKERNEL_RN = trsm_kernel_RN_sve.c -DTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -TRSMCOPYLN_M = trsm_lncopy_sve.c -TRSMCOPYLT_M = trsm_ltcopy_sve.c -TRSMCOPYUN_M = trsm_uncopy_sve.c -TRSMCOPYUT_M = trsm_utcopy_sve.c - -CTRSMKERNEL_LN = trsm_kernel_LN_sve.c -CTRSMKERNEL_LT = trsm_kernel_LT_sve.c -CTRSMKERNEL_RN = trsm_kernel_RN_sve.c -CTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c -ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c -ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c -ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c -ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c -ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c -ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c - - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - - -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -DDOTKERNEL = dot.S -ifneq ($(C_COMPILER), PGI) -SDOTKERNEL = ../generic/dot.c -else -SDOTKERNEL = dot.S -endif -ifneq ($(C_COMPILER), PGI) -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -else -CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c -endif -DSDOTKERNEL = dot.S - -DGEMM_BETA = dgemm_beta.S -SGEMM_BETA = sgemm_beta.S - -SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S - -SGEMMINCOPY = sgemm_ncopy_sve_v1.c -SGEMMITCOPY = sgemm_tcopy_sve_v1.c -SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S -SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S - -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -STRMMUNCOPY_M = trmm_uncopy_sve_v1.c -STRMMLNCOPY_M = trmm_lncopy_sve_v1.c -STRMMUTCOPY_M = trmm_utcopy_sve_v1.c -STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -SSYMMUCOPY_M = symm_ucopy_sve.c -SSYMMLCOPY_M = symm_lcopy_sve.c - -DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S -DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S - -DGEMMINCOPY = dgemm_ncopy_sve_v1.c -DGEMMITCOPY = dgemm_tcopy_sve_v1.c -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c -DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c -DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c -DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -DSYMMUCOPY_M = symm_ucopy_sve.c -DSYMMLCOPY_M = symm_lcopy_sve.c - -CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -CGEMMINCOPY = cgemm_ncopy_sve_v1.c -CGEMMITCOPY = cgemm_tcopy_sve_v1.c -CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c -CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -CHEMMLTCOPY_M = zhemm_ltcopy_sve.c -CHEMMUTCOPY_M = zhemm_utcopy_sve.c - -CSYMMUCOPY_M = zsymm_ucopy_sve.c -CSYMMLCOPY_M = zsymm_lcopy_sve.c - -ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -ZGEMMINCOPY = zgemm_ncopy_sve_v1.c -ZGEMMITCOPY = zgemm_tcopy_sve_v1.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c -ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c -ZHEMMUTCOPY_M = zhemm_utcopy_sve.c - -ZSYMMUCOPY_M = zsymm_ucopy_sve.c -ZSYMMLCOPY_M = zsymm_lcopy_sve.c +include $(KERNELDIR)/KERNEL.ARMV8SVE diff --git a/kernel/arm64/KERNEL.CORTEXA55 b/kernel/arm64/KERNEL.CORTEXA55 index e2e006770..574e98b8c 100644 --- a/kernel/arm64/KERNEL.CORTEXA55 +++ b/kernel/arm64/KERNEL.CORTEXA55 @@ -1,196 +1 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - - -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -ifneq ($(C_COMPILER), PGI) -SDOTKERNEL = ../generic/dot.c -else -SDOTKERNEL = dot.S -endif -DDOTKERNEL = dot.S -ifneq ($(C_COMPILER), PGI) -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -else -CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c -endif -DSDOTKERNEL = dot.S - -DGEMM_BETA = dgemm_beta.S -SGEMM_BETA = sgemm_beta.S - -ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S -else -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -endif -ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) -ifeq ($(SGEMM_UNROLL_M), 16) -SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S -else -SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -endif -ifeq ($(SGEMM_UNROLL_M), 4) -SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S -else -SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c -endif -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S -SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S - -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) - -ifeq ($(DGEMM_UNROLL_M), 8) -DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S -DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S -else -DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c -DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c -endif - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S -else -DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c -endif - -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +include $(KERNELDIR)/KERNEL.CORTEXA53 diff --git a/kernel/arm64/KERNEL.CORTEXA710 b/kernel/arm64/KERNEL.CORTEXA710 index bd25f7cd8..bc5999097 100644 --- a/kernel/arm64/KERNEL.CORTEXA710 +++ b/kernel/arm64/KERNEL.CORTEXA710 @@ -1,216 +1 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = trsm_kernel_LN_sve.c -STRSMKERNEL_LT = trsm_kernel_LT_sve.c -STRSMKERNEL_RN = trsm_kernel_RN_sve.c -STRSMKERNEL_RT = trsm_kernel_RT_sve.c - -DTRSMKERNEL_LN = trsm_kernel_LN_sve.c -DTRSMKERNEL_LT = trsm_kernel_LT_sve.c -DTRSMKERNEL_RN = trsm_kernel_RN_sve.c -DTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -TRSMCOPYLN_M = trsm_lncopy_sve.c -TRSMCOPYLT_M = trsm_ltcopy_sve.c -TRSMCOPYUN_M = trsm_uncopy_sve.c -TRSMCOPYUT_M = trsm_utcopy_sve.c - -CTRSMKERNEL_LN = trsm_kernel_LN_sve.c -CTRSMKERNEL_LT = trsm_kernel_LT_sve.c -CTRSMKERNEL_RN = trsm_kernel_RN_sve.c -CTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c -ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c -ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c -ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c -ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c -ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c -ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c - - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - - -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -DDOTKERNEL = dot.S -ifneq ($(C_COMPILER), PGI) -SDOTKERNEL = ../generic/dot.c -else -SDOTKERNEL = dot.S -endif -ifneq ($(C_COMPILER), PGI) -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -else -CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c -endif -DSDOTKERNEL = dot.S - -DGEMM_BETA = dgemm_beta.S -SGEMM_BETA = sgemm_beta.S - -SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S - -SGEMMINCOPY = sgemm_ncopy_sve_v1.c -SGEMMITCOPY = sgemm_tcopy_sve_v1.c -SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S -SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S - -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -STRMMUNCOPY_M = trmm_uncopy_sve_v1.c -STRMMLNCOPY_M = trmm_lncopy_sve_v1.c -STRMMUTCOPY_M = trmm_utcopy_sve_v1.c -STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -SSYMMUCOPY_M = symm_ucopy_sve.c -SSYMMLCOPY_M = symm_lcopy_sve.c - -DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S -DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S - -DGEMMINCOPY = dgemm_ncopy_sve_v1.c -DGEMMITCOPY = dgemm_tcopy_sve_v1.c -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c -DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c -DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c -DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -DSYMMUCOPY_M = symm_ucopy_sve.c -DSYMMLCOPY_M = symm_lcopy_sve.c - -CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -CGEMMINCOPY = cgemm_ncopy_sve_v1.c -CGEMMITCOPY = cgemm_tcopy_sve_v1.c -CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c -CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -CHEMMLTCOPY_M = zhemm_ltcopy_sve.c -CHEMMUTCOPY_M = zhemm_utcopy_sve.c - -CSYMMUCOPY_M = zsymm_ucopy_sve.c -CSYMMLCOPY_M = zsymm_lcopy_sve.c - -ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -ZGEMMINCOPY = zgemm_ncopy_sve_v1.c -ZGEMMITCOPY = zgemm_tcopy_sve_v1.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c -ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c -ZHEMMUTCOPY_M = zhemm_utcopy_sve.c - -ZSYMMUCOPY_M = zsymm_ucopy_sve.c -ZSYMMLCOPY_M = zsymm_lcopy_sve.c +include $(KERNELDIR)/KERNEL.ARMV8SVE diff --git a/kernel/arm64/KERNEL.CORTEXX2 b/kernel/arm64/KERNEL.CORTEXX2 index bd25f7cd8..bc5999097 100644 --- a/kernel/arm64/KERNEL.CORTEXX2 +++ b/kernel/arm64/KERNEL.CORTEXX2 @@ -1,216 +1 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = trsm_kernel_LN_sve.c -STRSMKERNEL_LT = trsm_kernel_LT_sve.c -STRSMKERNEL_RN = trsm_kernel_RN_sve.c -STRSMKERNEL_RT = trsm_kernel_RT_sve.c - -DTRSMKERNEL_LN = trsm_kernel_LN_sve.c -DTRSMKERNEL_LT = trsm_kernel_LT_sve.c -DTRSMKERNEL_RN = trsm_kernel_RN_sve.c -DTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -TRSMCOPYLN_M = trsm_lncopy_sve.c -TRSMCOPYLT_M = trsm_ltcopy_sve.c -TRSMCOPYUN_M = trsm_uncopy_sve.c -TRSMCOPYUT_M = trsm_utcopy_sve.c - -CTRSMKERNEL_LN = trsm_kernel_LN_sve.c -CTRSMKERNEL_LT = trsm_kernel_LT_sve.c -CTRSMKERNEL_RN = trsm_kernel_RN_sve.c -CTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c -ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c -ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c -ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c -ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c -ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c -ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c - - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - - -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -DDOTKERNEL = dot.S -ifneq ($(C_COMPILER), PGI) -SDOTKERNEL = ../generic/dot.c -else -SDOTKERNEL = dot.S -endif -ifneq ($(C_COMPILER), PGI) -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -else -CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c -endif -DSDOTKERNEL = dot.S - -DGEMM_BETA = dgemm_beta.S -SGEMM_BETA = sgemm_beta.S - -SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S - -SGEMMINCOPY = sgemm_ncopy_sve_v1.c -SGEMMITCOPY = sgemm_tcopy_sve_v1.c -SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S -SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S - -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -STRMMUNCOPY_M = trmm_uncopy_sve_v1.c -STRMMLNCOPY_M = trmm_lncopy_sve_v1.c -STRMMUTCOPY_M = trmm_utcopy_sve_v1.c -STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -SSYMMUCOPY_M = symm_ucopy_sve.c -SSYMMLCOPY_M = symm_lcopy_sve.c - -DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S -DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S - -DGEMMINCOPY = dgemm_ncopy_sve_v1.c -DGEMMITCOPY = dgemm_tcopy_sve_v1.c -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c -DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c -DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c -DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -DSYMMUCOPY_M = symm_ucopy_sve.c -DSYMMLCOPY_M = symm_lcopy_sve.c - -CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -CGEMMINCOPY = cgemm_ncopy_sve_v1.c -CGEMMITCOPY = cgemm_tcopy_sve_v1.c -CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c -CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -CHEMMLTCOPY_M = zhemm_ltcopy_sve.c -CHEMMUTCOPY_M = zhemm_utcopy_sve.c - -CSYMMUCOPY_M = zsymm_ucopy_sve.c -CSYMMLCOPY_M = zsymm_lcopy_sve.c - -ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -ZGEMMINCOPY = zgemm_ncopy_sve_v1.c -ZGEMMITCOPY = zgemm_tcopy_sve_v1.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c -ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c -ZHEMMUTCOPY_M = zhemm_utcopy_sve.c - -ZSYMMUCOPY_M = zsymm_ucopy_sve.c -ZSYMMLCOPY_M = zsymm_lcopy_sve.c +include $(KERNELDIR)/KERNEL.ARMV8SVE diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 index 9a5938459..bc5999097 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -1,189 +1 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = daxpy_thunderx2t99.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - - -SASUMKERNEL = sasum_thunderx2t99.c -DASUMKERNEL = dasum_thunderx2t99.c -CASUMKERNEL = casum_thunderx2t99.c -ZASUMKERNEL = zasum_thunderx2t99.c - -SCOPYKERNEL = copy_thunderx2t99.c -DCOPYKERNEL = copy_thunderx2t99.c -CCOPYKERNEL = copy_thunderx2t99.c -ZCOPYKERNEL = copy_thunderx2t99.c - -SSWAPKERNEL = swap_thunderx2t99.S -DSWAPKERNEL = swap_thunderx2t99.S -CSWAPKERNEL = swap_thunderx2t99.S -ZSWAPKERNEL = swap_thunderx2t99.S - -ISAMAXKERNEL = iamax_thunderx2t99.c -IDAMAXKERNEL = iamax_thunderx2t99.c -ICAMAXKERNEL = izamax_thunderx2t99.c -IZAMAXKERNEL = izamax_thunderx2t99.c - -SNRM2KERNEL = scnrm2_thunderx2t99.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c - -DDOTKERNEL = dot.c -SDOTKERNEL = dot.c -CDOTKERNEL = zdot_thunderx2t99.c -ZDOTKERNEL = zdot_thunderx2t99.c -DSDOTKERNEL = dot.S - -DGEMM_BETA = dgemm_beta.S -SGEMM_BETA = sgemm_beta.S - -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) -ifeq ($(SGEMM_UNROLL_M), 16) -SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S -else -SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -endif -ifeq ($(SGEMM_UNROLL_M), 4) -SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S -else -SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c -endif -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -ifeq ($(SGEMM_UNROLL_N), 16) -SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S -else -SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -endif -ifeq ($(SGEMM_UNROLL_N), 4) -SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S -else -SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c -endif -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S - -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) - -ifeq ($(DGEMM_UNROLL_M), 8) -DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S -DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S -else -DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c -DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c -endif - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S -else -DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c -endif - -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +include $(KERNELDIR)/KERNEL.ARMV8SVE diff --git a/kernel/arm64/KERNEL.THUNDERX3T110 b/kernel/arm64/KERNEL.THUNDERX3T110 index 41cedc851..5d3bd69f7 100644 --- a/kernel/arm64/KERNEL.THUNDERX3T110 +++ b/kernel/arm64/KERNEL.THUNDERX3T110 @@ -1,184 +1 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = daxpy_thunderx2t99.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) -SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c -SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c -SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S - -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) - -ifeq ($(DGEMM_UNROLL_M), 8) -DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S -DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S -else -DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c -DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c -endif - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S -else -DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c -endif - -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - -SASUMKERNEL = sasum_thunderx2t99.c -DASUMKERNEL = dasum_thunderx2t99.c -CASUMKERNEL = casum_thunderx2t99.c -ZASUMKERNEL = zasum_thunderx2t99.c - -SCOPYKERNEL = copy_thunderx2t99.c -DCOPYKERNEL = copy_thunderx2t99.c -CCOPYKERNEL = copy_thunderx2t99.c -ZCOPYKERNEL = copy_thunderx2t99.c - -SSWAPKERNEL = swap_thunderx2t99.S -DSWAPKERNEL = swap_thunderx2t99.S -CSWAPKERNEL = swap_thunderx2t99.S -ZSWAPKERNEL = swap_thunderx2t99.S - -ISAMAXKERNEL = iamax_thunderx2t99.c -IDAMAXKERNEL = iamax_thunderx2t99.c -ICAMAXKERNEL = izamax_thunderx2t99.c -IZAMAXKERNEL = izamax_thunderx2t99.c - -SNRM2KERNEL = scnrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c -#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c -#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c - - -DDOTKERNEL = dot.c -SDOTKERNEL = dot.c -CDOTKERNEL = zdot_thunderx2t99.c -ZDOTKERNEL = zdot_thunderx2t99.c -DSDOTKERNEL = dot.S - -ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) -DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S -endif - -ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4) -SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S -endif - -ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4) -CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S -endif - -ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4) -ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S -endif +include $(KERNELDIR)/KERNEL.THUNDERX2T99 diff --git a/kernel/arm64/cgemm_kernel_8x4.S b/kernel/arm64/cgemm_kernel_8x4.S index 24e08a646..f100adc7a 100644 --- a/kernel/arm64/cgemm_kernel_8x4.S +++ b/kernel/arm64/cgemm_kernel_8x4.S @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alphaR w17 -#define alphaI w18 +#define alphaI w19 #define alpha0_R s10 #define alphaV0_R v10.s[0] diff --git a/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S b/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S index 29a68ff22..2c63925be 100644 --- a/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S +++ b/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alphaR w17 -#define alphaI w18 +#define alphaI w19 #define alpha0_R s10 #define alphaV0_R v10.s[0] diff --git a/kernel/arm64/cgemm_kernel_sve_v1x4.S b/kernel/arm64/cgemm_kernel_sve_v1x4.S index 38770f66b..2136ebbee 100644 --- a/kernel/arm64/cgemm_kernel_sve_v1x4.S +++ b/kernel/arm64/cgemm_kernel_sve_v1x4.S @@ -240,7 +240,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pB, pB, 32 - prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNELv1x4_M1 @@ -276,9 +275,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1rw z15.s, p0/z, [pB, 28] add pB, pB, 32 - prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - - prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNELv1x4_M2 @@ -313,11 +309,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri z23.s, p1/m, z2.s, z15.s ld1rw z15.s, p0/z, [pB, 28] - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - add pB, pB, 32 - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] .endm .macro KERNELv1x4_E @@ -341,10 +333,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ii z22.s, p1/m, z3.s, z15.s OP_ri z23.s, p1/m, z2.s, z15.s - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] - .endm .macro KERNELv1x4_SUB @@ -383,13 +371,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ii z22.s, p1/m, z1.s, z15.s OP_ri z23.s, p1/m, z0.s, z15.s - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - prfm PLDL1KEEP, [pA, #A_PRE_SIZE] .endm .macro SAVEv1x4 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2w {z24.s, z25.s}, p1/z, [pCRow0] fmla z24.s, p1/m, z16.s, alphaz_R fmls z24.s, p1/m, z17.s, alphaz_I @@ -407,8 +391,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. st2w {z26.s, z27.s}, p1, [pCRow1] add pCRow1, pCRow1, lanes, lsl #3 - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - ld2w {z28.s, z29.s}, p1/z, [pCRow2] fmla z28.s, p1/m, z20.s, alphaz_R fmls z28.s, p1/m, z21.s, alphaz_I @@ -425,12 +407,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla z31.s, p1/m, z23.s, alphaz_R st2w {z30.s, z31.s}, p1, [pCRow3] - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -466,8 +444,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVEv1x2 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2w {z24.s, z25.s}, p1/z, [pCRow0] fmla z24.s, p1/m, z16.s, alphaz_R fmls z24.s, p1/m, z17.s, alphaz_I @@ -485,10 +461,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. st2w {z26.s, z27.s}, p1, [pCRow1] add pCRow1, pCRow1, lanes, lsl #3 - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -516,8 +488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVEv1x1 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2w {z24.s, z25.s}, p1/z, [pCRow0] fmla z24.s, p1/m, z16.s, alphaz_R fmls z24.s, p1/m, z17.s, alphaz_I @@ -527,8 +497,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4 - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -553,9 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - prfm PLDL1KEEP, [origPB] - prfm PLDL1KEEP, [origPA] - fmov alphaR, s0 dup alphaz_R, alphaR fmov alphaI, s1 @@ -676,10 +641,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bne .Lcgemm_kernel_L4_Mv1_46 .Lcgemm_kernel_L4_Mv1_100: - prfm PLDL1KEEP, [pA] - prfm PLDL1KEEP, [pA, #64] - prfm PLDL1KEEP, [origPB] - SAVEv1x4 .Lcgemm_kernel_L4_Mv1_END: diff --git a/kernel/arm64/cgemm_ncopy_sve_v1.c b/kernel/arm64/cgemm_ncopy_sve_v1.c index 6aa44a8f6..2fdaf5fcd 100644 --- a/kernel/arm64/cgemm_ncopy_sve_v1.c +++ b/kernel/arm64/cgemm_ncopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset = b; j = 0; - svbool_t pg = svwhilelt_b32(j, n); + svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); uint32_t active = svcntp_b32(svptrue_b32(), pg); do { @@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset += active * lda * 2; j += svcntw(); - pg = svwhilelt_b32(j, n); + pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); active = svcntp_b32(svptrue_b32(), pg); diff --git a/kernel/arm64/cgemm_tcopy_sve_v1.c b/kernel/arm64/cgemm_tcopy_sve_v1.c index 748cd954e..086a2fed1 100644 --- a/kernel/arm64/cgemm_tcopy_sve_v1.c +++ b/kernel/arm64/cgemm_tcopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset = b; j = 0; - svbool_t pg = svwhilelt_b32(j, n); + svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); uint32_t active = svcntp_b32(svptrue_b32(), pg); do { @@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset += active * 2; j += svcntw(); - pg = svwhilelt_b32(j, n); + pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/csum_thunderx2t99.c b/kernel/arm64/csum_thunderx2t99.c new file mode 100644 index 000000000..874f4eb5a --- /dev/null +++ b/kernel/arm64/csum_thunderx2t99.c @@ -0,0 +1,247 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#include + +#define N "x0" /* vector length */ +#define X "x1" /* "X" vector address */ +#define INC_X "x2" /* "X" stride */ +#define J "x5" /* loop variable */ + +#define REG0 "wzr" +#define SUMF "s0" +#define SUMFD "d0" + +/******************************************************************************/ + +#define KERNEL_F1 \ + "ldr d1, ["X"] \n" \ + "add "X", "X", #8 \n" \ + "ext v2.8b, v1.8b, v1.8b, #4 \n" \ + "fadd s1, s1, s2 \n" \ + "fadd "SUMF", "SUMF", s1 \n" + +#define KERNEL_F32 \ + "ldr q16, ["X"] \n" \ + "ldr q17, ["X", #16] \n" \ + "ldr q18, ["X", #32] \n" \ + "ldr q19, ["X", #48] \n" \ + "ldp q20, q21, ["X", #64] \n" \ + "ldp q22, q23, ["X", #96] \n" \ + "ldp q24, q25, ["X", #128] \n" \ + "ldp q26, q27, ["X", #160] \n" \ + "fadd v16.4s, v16.4s, v17.4s \n" \ + "fadd v18.4s, v18.4s, v19.4s \n" \ + "ldp q28, q29, ["X", #192] \n" \ + "ldp q30, q31, ["X", #224] \n" \ + "add "X", "X", #256 \n" \ + "fadd v20.4s, v20.4s, v21.4s \n" \ + "fadd v22.4s, v22.4s, v23.4s \n" \ + "PRFM PLDL1KEEP, ["X", #1024] \n" \ + "PRFM PLDL1KEEP, ["X", #1024+64] \n" \ + "fadd v24.4s, v24.4s, v25.4s \n" \ + "fadd v26.4s, v26.4s, v27.4s \n" \ + "fadd v0.4s, v0.4s, v16.4s \n" \ + "fadd v1.4s, v1.4s, v18.4s \n" \ + "fadd v2.4s, v2.4s, v20.4s \n" \ + "fadd v3.4s, v3.4s, v22.4s \n" \ + "PRFM PLDL1KEEP, ["X", #1024+128] \n" \ + "PRFM PLDL1KEEP, ["X", #1024+192] \n" \ + "fadd v28.4s, v28.4s, v29.4s \n" \ + "fadd v30.4s, v30.4s, v31.4s \n" \ + "fadd v4.4s, v4.4s, v24.4s \n" \ + "fadd v5.4s, v5.4s, v26.4s \n" \ + "fadd v6.4s, v6.4s, v28.4s \n" \ + "fadd v7.4s, v7.4s, v30.4s \n" + +#define KERNEL_F32_FINALIZE \ + "fadd v0.4s, v0.4s, v1.4s \n" \ + "fadd v2.4s, v2.4s, v3.4s \n" \ + "fadd v4.4s, v4.4s, v5.4s \n" \ + "fadd v6.4s, v6.4s, v7.4s \n" \ + "fadd v0.4s, v0.4s, v2.4s \n" \ + "fadd v4.4s, v4.4s, v6.4s \n" \ + "fadd v0.4s, v0.4s, v4.4s \n" \ + "ext v1.16b, v0.16b, v0.16b, #8 \n" \ + "fadd v0.2s, v0.2s, v1.2s \n" \ + "faddp "SUMF", v0.2s \n" + +#define INIT_S \ + "lsl "INC_X", "INC_X", #3 \n" + +#define KERNEL_S1 \ + "ldr d1, ["X"] \n" \ + "add "X", "X", "INC_X" \n" \ + "ext v2.8b, v1.8b, v1.8b, #4 \n" \ + "fadd s1, s1, s2 \n" \ + "fadd "SUMF", "SUMF", s1 \n" + + +#if defined(SMP) +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, + BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, + void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif + + +static FLOAT casum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT asum = 0.0 ; + + if ( n < 0 ) return(asum); + + __asm__ __volatile__ ( + " mov "N", %[N_] \n" + " mov "X", %[X_] \n" + " mov "INC_X", %[INCX_] \n" + " fmov "SUMF", "REG0" \n" + " fmov s1, "REG0" \n" + " fmov s2, "REG0" \n" + " fmov s3, "REG0" \n" + " fmov s4, "REG0" \n" + " fmov s5, "REG0" \n" + " fmov s6, "REG0" \n" + " fmov s7, "REG0" \n" + " cmp "N", xzr \n" + " ble 9f //asum_kernel_L999 \n" + " cmp "INC_X", xzr \n" + " ble 9f //asum_kernel_L999 \n" + " cmp "INC_X", #1 \n" + " bne 5f //asum_kernel_S_BEGIN \n" + + "1: //asum_kernel_F_BEGIN: \n" + " asr "J", "N", #5 \n" + " cmp "J", xzr \n" + " beq 3f //asum_kernel_F1 \n" + + "2: //asum_kernel_F32: \n" + " "KERNEL_F32" \n" + " subs "J", "J", #1 \n" + " bne 2b //asum_kernel_F32 \n" + " "KERNEL_F32_FINALIZE" \n" + + "3: //asum_kernel_F1: \n" + " ands "J", "N", #31 \n" + " ble 9f //asum_kernel_L999 \n" + + "4: //asum_kernel_F10: \n" + " "KERNEL_F1" \n" + " subs "J", "J", #1 \n" + " bne 4b //asum_kernel_F10 \n" + " b 9f //asum_kernel_L999 \n" + + "5: //asum_kernel_S_BEGIN: \n" + " "INIT_S" \n" + " asr "J", "N", #2 \n" + " cmp "J", xzr \n" + " ble 7f //asum_kernel_S1 \n" + + "6: //asum_kernel_S4: \n" + " "KERNEL_S1" \n" + " "KERNEL_S1" \n" + " "KERNEL_S1" \n" + " "KERNEL_S1" \n" + " subs "J", "J", #1 \n" + " bne 6b //asum_kernel_S4 \n" + + "7: //asum_kernel_S1: \n" + " ands "J", "N", #3 \n" + " ble 9f //asum_kernel_L999 \n" + + "8: //asum_kernel_S10: \n" + " "KERNEL_S1" \n" + " subs "J", "J", #1 \n" + " bne 8b //asum_kernel_S10 \n" + + "9: //asum_kernel_L999: \n" + " fmov %[ASUM_], "SUMFD" \n" + + : [ASUM_] "=r" (asum) //%0 + : [N_] "r" (n), //%1 + [X_] "r" (x), //%2 + [INCX_] "r" (inc_x) //%3 + : "cc", + "memory", + "x0", "x1", "x2", "x3", "x4", "x5", + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" + ); + + return asum; +} + +#if defined(SMP) +static int casum_thread_function(BLASLONG n, BLASLONG dummy0, + BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) +{ + *result = casum_compute(n, x, inc_x); + + return 0; +} +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; +#endif + FLOAT asum = 0.0; + +#if defined(SMP) + if (inc_x == 0 || n <= 10000) + nthreads = 1; + else + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { + asum = casum_compute(n, x, inc_x); + } else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) * 2]; + FLOAT *ptr; + + mode = BLAS_SINGLE | BLAS_COMPLEX; + + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, + x, inc_x, NULL, 0, result, 0, + ( void *)casum_thread_function, nthreads); + + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + asum = asum + (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2); + } + } +#else + asum = casum_compute(n, x, inc_x); +#endif + + return asum; +} diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S index 5c0827397..e8f1d8cf3 100644 --- a/kernel/arm64/ctrmm_kernel_8x4.S +++ b/kernel/arm64/ctrmm_kernel_8x4.S @@ -49,10 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alphaR w17 -#define alphaI w18 -#define temp x19 -#define tempOffset x20 -#define tempK x21 +#define alphaI w19 +#define temp x20 +#define tempOffset x21 +#define tempK x22 #define alpha0_R s10 #define alphaV0_R v10.s[0] diff --git a/kernel/arm64/dgemm_ncopy_sve_v1.c b/kernel/arm64/dgemm_ncopy_sve_v1.c deleted file mode 100644 index 1f812c775..000000000 --- a/kernel/arm64/dgemm_ncopy_sve_v1.c +++ /dev/null @@ -1,79 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#include -#include "common.h" -#include - -// TODO: write in assembly with proper unrolling of inner loop -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ - - BLASLONG j; - IFLOAT *aoffset, *aoffset1, *boffset; - - svint64_t lda_vec = svindex_s64(0LL, lda); - uint64_t sve_size = svcntd(); - - aoffset = a; - boffset = b; - - j = 0; - svbool_t pg = svwhilelt_b64(j, n); - uint64_t active = svcntp_b64(svptrue_b64(), pg); - do { - - aoffset1 = aoffset; - - uint64_t i_cnt = m; - while (i_cnt--) { - svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec); - svst1_f64(pg, (double *) boffset, a_vec); - aoffset1++; - boffset += active; - } - aoffset += sve_size * lda; - - j += svcntd(); - pg = svwhilelt_b64(j, n); - active = svcntp_b64(svptrue_b64(), pg); - - - } while (svptest_any(svptrue_b64(), pg)); - - return 0; -} diff --git a/kernel/arm64/dgemm_tcopy_sve_v1.c b/kernel/arm64/dgemm_tcopy_sve_v1.c deleted file mode 100644 index cb645a1b6..000000000 --- a/kernel/arm64/dgemm_tcopy_sve_v1.c +++ /dev/null @@ -1,77 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#include -#include "common.h" -#include - -// TODO: write in assembly with proper unrolling of inner loop -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ - - BLASLONG j; - IFLOAT *aoffset, *aoffset1, *boffset; - - uint64_t sve_size = svcntd(); - - aoffset = a; - boffset = b; - - j = 0; - svbool_t pg = svwhilelt_b64(j, n); - uint64_t active = svcntp_b64(svptrue_b64(), pg); - do { - - aoffset1 = aoffset; - - uint64_t i_cnt = m; - while (i_cnt--) { - svfloat64_t a_vec = svld1(pg, (double *)aoffset1); - svst1_f64(pg, (double *) boffset, a_vec); - aoffset1 += lda; - boffset += active; - } - aoffset += sve_size; - - j += svcntd(); - pg = svwhilelt_b64(j, n); - active = svcntp_b64(svptrue_b64(), pg); - - } while (svptest_any(svptrue_b64(), pg)); - - return 0; -} diff --git a/kernel/arm64/dot_kernel_sve.c b/kernel/arm64/dot_kernel_sve.c index 8460e0d5e..16f4cd537 100644 --- a/kernel/arm64/dot_kernel_sve.c +++ b/kernel/arm64/dot_kernel_sve.c @@ -1,4 +1,5 @@ /*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project Copyright (c) 2022, Arm Ltd All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,37 +31,84 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #ifdef DOUBLE -#define SVE_TYPE svfloat64_t -#define SVE_ZERO svdup_f64(0.0) -#define SVE_WHILELT svwhilelt_b64 -#define SVE_ALL svptrue_b64() -#define SVE_WIDTH svcntd() +#define DTYPE "d" +#define WIDTH "d" +#define SHIFT "3" #else -#define SVE_TYPE svfloat32_t -#define SVE_ZERO svdup_f32(0.0) -#define SVE_WHILELT svwhilelt_b32 -#define SVE_ALL svptrue_b32() -#define SVE_WIDTH svcntw() +#define DTYPE "s" +#define WIDTH "w" +#define SHIFT "2" #endif -static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) { - SVE_TYPE acc_a = SVE_ZERO; - SVE_TYPE acc_b = SVE_ZERO; +#define COUNT \ +" cnt"WIDTH" x9 \n" +#define SETUP_TRUE \ +" ptrue p0."DTYPE" \n" +#define OFFSET_INPUTS \ +" add x12, %[X_], x9, lsl #"SHIFT" \n" \ +" add x13, %[Y_], x9, lsl #"SHIFT" \n" +#define TAIL_WHILE \ +" whilelo p1."DTYPE", x8, x0 \n" +#define UPDATE(pg, x,y,out) \ +" ld1"WIDTH" { z2."DTYPE" }, "pg"/z, ["x", x8, lsl #"SHIFT"] \n" \ +" ld1"WIDTH" { z3."DTYPE" }, "pg"/z, ["y", x8, lsl #"SHIFT"] \n" \ +" fmla "out"."DTYPE", "pg"/m, z2."DTYPE", z3."DTYPE" \n" +#define SUM_VECTOR(v) \ +" faddv "DTYPE""v", p0, z"v"."DTYPE" \n" +#define RET \ +" fadd %"DTYPE"[RET_], "DTYPE"1, "DTYPE"0 \n" - BLASLONG sve_width = SVE_WIDTH; +#define DOT_KERNEL \ + COUNT \ +" mov z1.d, #0 \n" \ +" mov z0.d, #0 \n" \ +" mov x8, #0 \n" \ +" movi d1, #0x0 \n" \ + SETUP_TRUE \ +" neg x10, x9, lsl #1 \n" \ +" ands x11, x10, x0 \n" \ +" b.eq 2f // skip_2x \n" \ + OFFSET_INPUTS \ +"1: // vector_2x \n" \ + UPDATE("p0", "%[X_]", "%[Y_]", "z1") \ + UPDATE("p0", "x12", "x13", "z0") \ +" sub x8, x8, x10 \n" \ +" cmp x8, x11 \n" \ +" b.lo 1b // vector_2x \n" \ + SUM_VECTOR("1") \ +"2: // skip_2x \n" \ +" neg x10, x9 \n" \ +" and x10, x10, x0 \n" \ +" cmp x8, x10 \n" \ +" b.hs 4f // tail \n" \ +"3: // vector_1x \n" \ + UPDATE("p0", "%[X_]", "%[Y_]", "z0") \ +" add x8, x8, x9 \n" \ +" cmp x8, x10 \n" \ +" b.lo 3b // vector_1x \n" \ +"4: // tail \n" \ +" cmp x10, x0 \n" \ +" b.eq 5f // end \n" \ + TAIL_WHILE \ + UPDATE("p1", "%[X_]", "%[Y_]", "z0") \ +"5: // end \n" \ + SUM_VECTOR("0") \ + RET - for (BLASLONG i = 0; i < n; i += sve_width * 2) { - svbool_t pg_a = SVE_WHILELT(i, n); - svbool_t pg_b = SVE_WHILELT(i + sve_width, n); +static +FLOAT +dot_kernel_sve(BLASLONG n, FLOAT* x, FLOAT* y) +{ + FLOAT ret; - SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); - SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); - SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]); - SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]); + asm(DOT_KERNEL + : + [RET_] "=&w" (ret) + : + [N_] "r" (n), + [X_] "r" (x), + [Y_] "r" (y) + :); - acc_a = svmla_m(pg_a, acc_a, x_vec_a, y_vec_a); - acc_b = svmla_m(pg_b, acc_b, x_vec_b, y_vec_b); - } - - return svaddv(SVE_ALL, acc_a) + svaddv(SVE_ALL, acc_b); + return ret; } diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c index e342b0b63..6077c85dd 100644 --- a/kernel/arm64/dznrm2_thunderx2t99.c +++ b/kernel/arm64/dznrm2_thunderx2t99.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" - +#include #include #if defined(SMP) @@ -404,7 +404,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) #else nrm2_compute(n, x, inc_x, &ssq, &scale); #endif - if (fabs(scale) <1.e-300) return 0.; + volatile FLOAT sca = fabs(scale); + if (sca < DBL_MIN) return 0.; ssq = sqrt(ssq) * scale; return ssq; diff --git a/kernel/arm64/gemm_ncopy_complex_sve_v1x4.c b/kernel/arm64/gemm_ncopy_complex_sve_v1x4.c new file mode 100644 index 000000000..90f867b44 --- /dev/null +++ b/kernel/arm64/gemm_ncopy_complex_sve_v1x4.c @@ -0,0 +1,121 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include + +#include "common.h" + +#ifdef DOUBLE +#define COUNT "cntd" +#define SV_TYPE svfloat64_t +#define SV_INDEX svuint64_t +#define SV_INDEXER svindex_u64 +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64 +#else +#define COUNT "cntw" +#define SV_TYPE svfloat32_t +#define SV_INDEX svuint32_t +#define SV_INDEXER svindex_u32 +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32 +#endif + +#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \ + a_vec_real = svld1_gather_index(pg, a_offset_inner, lda_vec); \ + a_vec_imag = svld1_gather_index(pg, a_offset_inner + 1, lda_vec); \ + svst2(pg, b_offset, svcreate2(a_vec_real, a_vec_imag)); \ + a_offset_inner += 2; \ + b_offset += active * 2; + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + uint64_t sve_size; + asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : ); + + IFLOAT *a_offset, *a_offset_inner, *b_offset; + a_offset = a; + b_offset = b; + + SV_INDEX lda_vec = SV_INDEXER(0LL, lda * 2); + SV_TYPE a_vec_real; + SV_TYPE a_vec_imag; + svbool_t pg_true = SV_TRUE(); + + BLASLONG single_vectors_n = n & -sve_size; + for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) { + a_offset_inner = a_offset; + + svbool_t pg = pg_true; + uint64_t active = sve_size; + uint64_t i_cnt = m >> 2; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + a_offset += sve_size * lda * 2; + } + + BLASLONG remaining_n = n - single_vectors_n; + if (remaining_n) { + a_offset_inner = a_offset; + svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); + uint64_t active = remaining_n; + uint64_t i_cnt = m >> 2; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + } + + return 0; +} + diff --git a/kernel/arm64/gemm_ncopy_sve_v1x8.c b/kernel/arm64/gemm_ncopy_sve_v1x8.c new file mode 100644 index 000000000..7b2a2e767 --- /dev/null +++ b/kernel/arm64/gemm_ncopy_sve_v1x8.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include + +#include "common.h" + +#ifdef DOUBLE +#define COUNT "cntd" +#define SV_TYPE svfloat64_t +#define SV_INDEX svuint64_t +#define SV_INDEXER svindex_u64 +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64 +#define SV_PREFETCH svprfd_gather_index +#else +#define COUNT "cntw" +#define SV_TYPE svfloat32_t +#define SV_INDEX svuint32_t +#define SV_INDEXER svindex_u32 +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32 +#define SV_PREFETCH svprfw_gather_index +#endif + +#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \ + a_vec = svld1_gather_index(pg, a_offset_inner, lda_vec); \ + svst1(pg, b_offset, a_vec); \ + a_offset_inner++; \ + b_offset += active; + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + uint64_t sve_size; + asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : ); + + IFLOAT *a_offset, *a_offset_inner, *b_offset; + a_offset = a; + b_offset = b; + + SV_INDEX lda_vec = SV_INDEXER(0LL, lda); + SV_TYPE a_vec; + svbool_t pg_true = SV_TRUE(); + + BLASLONG single_vectors_n = n & -sve_size; + for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) { + a_offset_inner = a_offset; + + svbool_t pg = pg_true; + uint64_t active = sve_size; + uint64_t i_cnt = m >> 3; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 4) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + a_offset += sve_size * lda; + } + + BLASLONG remaining_n = n - single_vectors_n; + if (remaining_n) { + a_offset_inner = a_offset; + svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); + uint64_t active = remaining_n; + uint64_t i_cnt = m >> 2; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + } + + return 0; +} diff --git a/kernel/arm64/gemm_tcopy_complex_sve_v1x4.c b/kernel/arm64/gemm_tcopy_complex_sve_v1x4.c new file mode 100644 index 000000000..975166a2e --- /dev/null +++ b/kernel/arm64/gemm_tcopy_complex_sve_v1x4.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include + +#include "common.h" + +#ifdef DOUBLE +#define COUNT "cntd" +#define SV_TYPE svfloat64x2_t +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64 +#else +#define COUNT "cntw" +#define SV_TYPE svfloat32x2_t +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32 +#endif + +#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \ + a_vec = svld2(pg, a_offset_inner); \ + svst2(pg, b_offset, a_vec); \ + a_offset_inner += lda * 2; \ + b_offset += active * 2; + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + uint64_t sve_size = svcntw(); + asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : ); + + IFLOAT *a_offset, *a_offset_inner, *b_offset; + a_offset = a; + b_offset = b; + + SV_TYPE a_vec; + svbool_t pg_true = SV_TRUE(); + + BLASLONG single_vectors_n = n & -sve_size; + for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) { + a_offset_inner = a_offset; + + svbool_t pg = pg_true; + uint64_t active = sve_size; + uint64_t i_cnt = m >> 2; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + a_offset += sve_size * 2; + } + + BLASLONG remaining_n = n - single_vectors_n; + if (remaining_n) { + a_offset_inner = a_offset; + svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); + uint64_t active = remaining_n; + uint64_t i_cnt = m >> 2; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + } + + return 0; +} + + diff --git a/kernel/arm64/gemm_tcopy_sve_v1x8.c b/kernel/arm64/gemm_tcopy_sve_v1x8.c new file mode 100644 index 000000000..9a93b6cb7 --- /dev/null +++ b/kernel/arm64/gemm_tcopy_sve_v1x8.c @@ -0,0 +1,125 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include + +#include "common.h" + +#ifdef DOUBLE +#define COUNT "cntd" +#define SV_TYPE svfloat64_t +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64 +#else +#define COUNT "cntw" +#define SV_TYPE svfloat32_t +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32 +#endif + +#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \ + a_vec = svld1(pg, a_offset_inner); \ + svst1(pg, b_offset, a_vec); \ + a_offset_inner += lda; \ + b_offset += active; + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + uint64_t sve_size = svcntw(); + asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : ); + + IFLOAT *a_offset, *a_offset_inner, *b_offset; + a_offset = a; + b_offset = b; + + SV_TYPE a_vec; + svbool_t pg_true = SV_TRUE(); + + BLASLONG single_vectors_n = n & -sve_size; + for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) { + a_offset_inner = a_offset; + + svbool_t pg = pg_true; + uint64_t active = sve_size; + uint64_t i_cnt = m >> 3; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 4) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + a_offset += sve_size; + } + + BLASLONG remaining_n = n - single_vectors_n; + if (remaining_n) { + a_offset_inner = a_offset; + svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); + uint64_t active = remaining_n; + uint64_t i_cnt = m >> 2; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + } + + return 0; +} + diff --git a/kernel/arm64/sgemm_beta.S b/kernel/arm64/sgemm_beta.S old mode 100755 new mode 100644 diff --git a/kernel/arm64/sgemm_ncopy_sve_v1.c b/kernel/arm64/sgemm_ncopy_sve_v1.c deleted file mode 100644 index 1bc186335..000000000 --- a/kernel/arm64/sgemm_ncopy_sve_v1.c +++ /dev/null @@ -1,78 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#include -#include "common.h" -#include - -// TODO: write in assembly with proper unrolling of inner loop -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ - - BLASLONG j; - IFLOAT *aoffset, *aoffset1, *boffset; - - svint32_t lda_vec = svindex_s32(0LL, lda); - uint32_t sve_size = svcntw(); - - aoffset = a; - boffset = b; - - j = 0; - svbool_t pg = svwhilelt_b32(j, n); - uint32_t active = svcntp_b32(svptrue_b32(), pg); - do { - - aoffset1 = aoffset; - - uint32_t i_cnt = m; - while (i_cnt--) { - svfloat32_t a_vec = svld1_gather_index(pg, (float *) aoffset1, lda_vec); - svst1_f32(pg, (float *) boffset, a_vec); - aoffset1++; - boffset += active; - } - aoffset += sve_size * lda; - - j += svcntw(); - pg = svwhilelt_b32(j, n); - active = svcntp_b32(svptrue_b32(), pg); - - } while (svptest_any(svptrue_b32(), pg)); - - return 0; -} diff --git a/kernel/arm64/sgemm_tcopy_sve_v1.c b/kernel/arm64/sgemm_tcopy_sve_v1.c deleted file mode 100644 index 9f8cf502a..000000000 --- a/kernel/arm64/sgemm_tcopy_sve_v1.c +++ /dev/null @@ -1,77 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#include -#include "common.h" -#include - -// TODO: write in assembly with proper unrolling of inner loop -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ - - BLASLONG j; - IFLOAT *aoffset, *aoffset1, *boffset; - - uint32_t sve_size = svcntw(); - - aoffset = a; - boffset = b; - - j = 0; - svbool_t pg = svwhilelt_b32(j, n); - uint32_t active = svcntp_b32(svptrue_b32(), pg); - do { - - aoffset1 = aoffset; - - uint32_t i_cnt = m; - while (i_cnt--) { - svfloat32_t a_vec = svld1(pg, (float *) aoffset1); - svst1_f32(pg, (float *) boffset, a_vec); - aoffset1 += lda; - boffset += active; - } - aoffset += sve_size; - - j += svcntw(); - pg = svwhilelt_b32(j, n); - active = svcntp_b32(svptrue_b32(), pg); - - } while (svptest_any(svptrue_b32(), pg)); - - return 0; -} diff --git a/kernel/arm64/symm_lcopy_sve.c b/kernel/arm64/symm_lcopy_sve.c index 6ba4afc8b..e138f0647 100644 --- a/kernel/arm64/symm_lcopy_sve.c +++ b/kernel/arm64/symm_lcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t N = n; int32_t j = 0; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/symm_ucopy_sve.c b/kernel/arm64/symm_ucopy_sve.c index 32da5bd16..9a4cb6d4f 100644 --- a/kernel/arm64/symm_ucopy_sve.c +++ b/kernel/arm64/symm_ucopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t N = n; int32_t j = 0; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/trmm_lncopy_sve_v1.c b/kernel/arm64/trmm_lncopy_sve_v1.c index 918e945ac..c7f79e3fd 100644 --- a/kernel/arm64/trmm_lncopy_sve_v1.c +++ b/kernel/arm64/trmm_lncopy_sve_v1.c @@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; #ifdef DOUBLE svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trmm_ltcopy_sve_v1.c b/kernel/arm64/trmm_ltcopy_sve_v1.c index b76cc56de..b3ba68973 100644 --- a/kernel/arm64/trmm_ltcopy_sve_v1.c +++ b/kernel/arm64/trmm_ltcopy_sve_v1.c @@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; #ifdef DOUBLE - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -122,11 +122,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trmm_uncopy_sve_v1.c b/kernel/arm64/trmm_uncopy_sve_v1.c index 75fa163ae..a47d2096c 100644 --- a/kernel/arm64/trmm_uncopy_sve_v1.c +++ b/kernel/arm64/trmm_uncopy_sve_v1.c @@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; #ifdef DOUBLE svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trmm_utcopy_sve_v1.c b/kernel/arm64/trmm_utcopy_sve_v1.c index 36a03242a..c5188beb4 100644 --- a/kernel/arm64/trmm_utcopy_sve_v1.c +++ b/kernel/arm64/trmm_utcopy_sve_v1.c @@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; #ifdef DOUBLE - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -121,11 +121,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_lncopy_sve.c b/kernel/arm64/trsm_lncopy_sve.c index 5a9d4194a..2895eb85d 100644 --- a/kernel/arm64/trsm_lncopy_sve.c +++ b/kernel/arm64/trsm_lncopy_sve.c @@ -56,13 +56,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #ifdef DOUBLE int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -106,11 +106,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_ltcopy_sve.c b/kernel/arm64/trsm_ltcopy_sve.c index ac4019e26..fdda992e0 100644 --- a/kernel/arm64/trsm_ltcopy_sve.c +++ b/kernel/arm64/trsm_ltcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; #ifdef DOUBLE int64_t js = 0; - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_uncopy_sve.c b/kernel/arm64/trsm_uncopy_sve.c index 8fdcd0f4b..1a03aa974 100644 --- a/kernel/arm64/trsm_uncopy_sve.c +++ b/kernel/arm64/trsm_uncopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -56,13 +57,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #ifdef DOUBLE int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_utcopy_sve.c b/kernel/arm64/trsm_utcopy_sve.c index 0f5f0dccd..b06166f36 100644 --- a/kernel/arm64/trsm_utcopy_sve.c +++ b/kernel/arm64/trsm_utcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; #ifdef DOUBLE int64_t js = 0; - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/zdot_thunderx2t99.c b/kernel/arm64/zdot_thunderx2t99.c index 728f97fb3..6f65e5cfd 100644 --- a/kernel/arm64/zdot_thunderx2t99.c +++ b/kernel/arm64/zdot_thunderx2t99.c @@ -24,7 +24,12 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#if (NVCOMPVERS < 2309) +#pragma opt 1 +#endif +#endif #include "common.h" diff --git a/kernel/arm64/zgemm_kernel_sve_v1x4.S b/kernel/arm64/zgemm_kernel_sve_v1x4.S index d5b35775c..a043948d6 100644 --- a/kernel/arm64/zgemm_kernel_sve_v1x4.S +++ b/kernel/arm64/zgemm_kernel_sve_v1x4.S @@ -239,8 +239,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1rd z15.d, p0/z, [pB, 56] add pB, pB, 64 - - prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNELv1x4_M1 @@ -276,9 +274,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1rd z15.d, p0/z, [pB, 56] add pB, pB, 64 - prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - - prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNELv1x4_M2 @@ -313,11 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri z23.d, p1/m, z2.d, z15.d ld1rd z15.d, p0/z, [pB, 56] - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - add pB, pB, 64 - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] .endm .macro KERNELv1x4_E @@ -340,11 +331,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ir z23.d, p1/m, z3.d, z14.d OP_ii z22.d, p1/m, z3.d, z15.d OP_ri z23.d, p1/m, z2.d, z15.d - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] - .endm .macro KERNELv1x4_SUB @@ -382,14 +368,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ir z23.d, p1/m, z1.d, z14.d OP_ii z22.d, p1/m, z1.d, z15.d OP_ri z23.d, p1/m, z0.d, z15.d - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - prfm PLDL1KEEP, [pA, #A_PRE_SIZE] .endm .macro SAVEv1x4 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2d {z24.d, z25.d}, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaz_R fmls z24.d, p1/m, z17.d, alphaz_I @@ -407,7 +388,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. st2d {z26.d, z27.d}, p1, [pCRow1] add pCRow1, pCRow1, lanes, lsl #4 - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld2d {z28.d, z29.d}, p1/z, [pCRow2] fmla z28.d, p1/m, z20.d, alphaz_R @@ -425,12 +405,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla z31.d, p1/m, z23.d, alphaz_R st2d {z30.d, z31.d}, p1, [pCRow3] - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -466,8 +442,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVEv1x2 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2d {z24.d, z25.d}, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaz_R fmls z24.d, p1/m, z17.d, alphaz_I @@ -485,10 +459,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. st2d {z26.d, z27.d}, p1, [pCRow1] add pCRow1, pCRow1, lanes, lsl #4 - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -516,8 +486,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVEv1x1 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2d {z24.d, z25.d}, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaz_R fmls z24.d, p1/m, z17.d, alphaz_I @@ -527,8 +495,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -553,9 +519,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - prfm PLDL1KEEP, [origPB] - prfm PLDL1KEEP, [origPA] - fmov alphaR, d0 dup alphaz_R, alphaR fmov alphaI, d1 @@ -676,10 +639,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bne .Lzgemm_kernel_L4_Mv1_46 .Lzgemm_kernel_L4_Mv1_100: - prfm PLDL1KEEP, [pA] - prfm PLDL1KEEP, [pA, #64] - prfm PLDL1KEEP, [origPB] - SAVEv1x4 .Lzgemm_kernel_L4_Mv1_END: diff --git a/kernel/arm64/zgemm_ncopy_sve_v1.c b/kernel/arm64/zgemm_ncopy_sve_v1.c index 8f9b4268a..6b8c93baf 100644 --- a/kernel/arm64/zgemm_ncopy_sve_v1.c +++ b/kernel/arm64/zgemm_ncopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset = b; j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); uint64_t active = svcntp_b64(svptrue_b64(), pg); do { @@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset += active * lda * 2; j += svcntd(); - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); diff --git a/kernel/arm64/zgemm_tcopy_sve_v1.c b/kernel/arm64/zgemm_tcopy_sve_v1.c index c6e50bc1c..fd8d2190f 100644 --- a/kernel/arm64/zgemm_tcopy_sve_v1.c +++ b/kernel/arm64/zgemm_tcopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset = b; j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); uint64_t active = svcntp_b64(svptrue_b64(), pg); do { @@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset += active * 2; j += svcntd(); - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); diff --git a/kernel/arm64/zhemm_ltcopy_sve.c b/kernel/arm64/zhemm_ltcopy_sve.c index 37dbfe4e1..fcf7e7073 100644 --- a/kernel/arm64/zhemm_ltcopy_sve.c +++ b/kernel/arm64/zhemm_ltcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -79,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(cmp, gat_ind, lda_vec); gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); if (offset <= 0) { - svbool_t off_g = svwhilelt_b64(offset, 0LL); + svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -117,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t j = 0; int32_t N = n; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(cmp, gat_ind, lda_vec); gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); if (offset <= 0) { - svbool_t off_g = svwhilelt_b32(offset, 0); + svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/zhemm_utcopy_sve.c b/kernel/arm64/zhemm_utcopy_sve.c index 21e03b7be..056c9824e 100644 --- a/kernel/arm64/zhemm_utcopy_sve.c +++ b/kernel/arm64/zhemm_utcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -80,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); data_vec_imag = svneg_z(pg, data_vec_imag); if (offset <= 0) { - svbool_t off_g = svwhilelt_b64(offset, 0LL); + svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -100,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); #else @@ -116,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t j = 0; int32_t N = n; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); data_vec_imag = svneg_z(pg, data_vec_imag); if (offset <= 0) { - svbool_t off_g = svwhilelt_b32(offset, 0); + svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/zscal.S b/kernel/arm64/zscal.S index 929455975..4bd43320d 100644 --- a/kernel/arm64/zscal.S +++ b/kernel/arm64/zscal.S @@ -223,7 +223,7 @@ zscal_begin: fcmp DA_I, #0.0 beq .Lzscal_kernel_RI_zero - b .Lzscal_kernel_R_zero +// b .Lzscal_kernel_R_zero .Lzscal_kernel_R_non_zero: diff --git a/kernel/arm64/zsum_thunderx2t99.c b/kernel/arm64/zsum_thunderx2t99.c new file mode 100644 index 000000000..087dae2fe --- /dev/null +++ b/kernel/arm64/zsum_thunderx2t99.c @@ -0,0 +1,244 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#include + +#define N "x0" /* vector length */ +#define X "x1" /* "X" vector address */ +#define INC_X "x2" /* "X" stride */ +#define J "x5" /* loop variable */ + +#define REG0 "xzr" +#define SUMF "d0" +#define TMPF "d1" + +/******************************************************************************/ + +#define KERNEL_F1 \ + "ldr q1, ["X"] \n" \ + "add "X", "X", #16 \n" \ + "faddp d1, v1.2d \n" \ + "fadd "SUMF", "SUMF", d1 \n" + +#define KERNEL_F16 \ + "ldr q16, ["X"] \n" \ + "ldr q17, ["X", #16] \n" \ + "ldr q18, ["X", #32] \n" \ + "ldr q19, ["X", #48] \n" \ + "ldp q20, q21, ["X", #64] \n" \ + "ldp q22, q23, ["X", #96] \n" \ + "ldp q24, q25, ["X", #128] \n" \ + "ldp q26, q27, ["X", #160] \n" \ + "fadd v16.2d, v16.2d, v17.2d \n" \ + "fadd v18.2d, v18.2d, v19.2d \n" \ + "ldp q28, q29, ["X", #192] \n" \ + "ldp q30, q31, ["X", #224] \n" \ + "add "X", "X", #256 \n" \ + "fadd v20.2d, v20.2d, v21.2d \n" \ + "fadd v22.2d, v22.2d, v23.2d \n" \ + "PRFM PLDL1KEEP, ["X", #1024] \n" \ + "PRFM PLDL1KEEP, ["X", #1024+64] \n" \ + "fadd v24.2d, v24.2d, v25.2d \n" \ + "fadd v26.2d, v26.2d, v27.2d \n" \ + "fadd v28.2d, v28.2d, v29.2d \n" \ + "fadd v30.2d, v30.2d, v31.2d \n" \ + "fadd v0.2d, v0.2d, v16.2d \n" \ + "fadd v1.2d, v1.2d, v18.2d \n" \ + "fadd v2.2d, v2.2d, v20.2d \n" \ + "fadd v3.2d, v3.2d, v22.2d \n" \ + "PRFM PLDL1KEEP, ["X", #1024+128] \n" \ + "PRFM PLDL1KEEP, ["X", #1024+192] \n" \ + "fadd v4.2d, v4.2d, v24.2d \n" \ + "fadd v5.2d, v5.2d, v26.2d \n" \ + "fadd v6.2d, v6.2d, v28.2d \n" \ + "fadd v7.2d, v7.2d, v30.2d \n" + +#define KERNEL_F16_FINALIZE \ + "fadd v0.2d, v0.2d, v1.2d \n" \ + "fadd v2.2d, v2.2d, v3.2d \n" \ + "fadd v4.2d, v4.2d, v5.2d \n" \ + "fadd v6.2d, v6.2d, v7.2d \n" \ + "fadd v0.2d, v0.2d, v2.2d \n" \ + "fadd v4.2d, v4.2d, v6.2d \n" \ + "fadd v0.2d, v0.2d, v4.2d \n" \ + "faddp "SUMF", v0.2d \n" + +#define INIT_S \ + "lsl "INC_X", "INC_X", #4 \n" + +#define KERNEL_S1 \ + "ldr q1, ["X"] \n" \ + "add "X", "X", "INC_X" \n" \ + "faddp d1, v1.2d \n" \ + "fadd "SUMF", "SUMF", d1 \n" + + +#if defined(SMP) +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, + BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, + void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif + + +static FLOAT zasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT asum = 0.0 ; + + if ( n < 0 ) return(asum); + + __asm__ __volatile__ ( + " mov "N", %[N_] \n" + " mov "X", %[X_] \n" + " mov "INC_X", %[INCX_] \n" + " fmov "SUMF", "REG0" \n" + " fmov d1, "REG0" \n" + " fmov d2, "REG0" \n" + " fmov d3, "REG0" \n" + " fmov d4, "REG0" \n" + " fmov d5, "REG0" \n" + " fmov d6, "REG0" \n" + " fmov d7, "REG0" \n" + " cmp "N", xzr \n" + " ble 9f //asum_kernel_L999 \n" + " cmp "INC_X", xzr \n" + " ble 9f //asum_kernel_L999 \n" + " cmp "INC_X", #1 \n" + " bne 5f //asum_kernel_S_BEGIN \n" + + "1: //asum_kernel_F_BEGIN: \n" + " asr "J", "N", #4 \n" + " cmp "J", xzr \n" + " beq 3f //asum_kernel_F1 \n" + + ".align 5 \n" + "2: //asum_kernel_F16: \n" + " "KERNEL_F16" \n" + " subs "J", "J", #1 \n" + " bne 2b //asum_kernel_F16 \n" + " "KERNEL_F16_FINALIZE" \n" + + "3: //asum_kernel_F1: \n" + " ands "J", "N", #15 \n" + " ble 9f //asum_kernel_L999 \n" + + "4: //asum_kernel_F10: \n" + " "KERNEL_F1" \n" + " subs "J", "J", #1 \n" + " bne 4b //asum_kernel_F10 \n" + " b 9f //asum_kernel_L999 \n" + + "5: //asum_kernel_S_BEGIN: \n" + " "INIT_S" \n" + " asr "J", "N", #2 \n" + " cmp "J", xzr \n" + " ble 7f //asum_kernel_S1 \n" + + "6: //asum_kernel_S4: \n" + " "KERNEL_S1" \n" + " "KERNEL_S1" \n" + " "KERNEL_S1" \n" + " "KERNEL_S1" \n" + " subs "J", "J", #1 \n" + " bne 6b //asum_kernel_S4 \n" + + "7: //asum_kernel_S1: \n" + " ands "J", "N", #3 \n" + " ble 9f //asum_kernel_L999 \n" + + "8: //asum_kernel_S10: \n" + " "KERNEL_S1" \n" + " subs "J", "J", #1 \n" + " bne 8b //asum_kernel_S10 \n" + + "9: //asum_kernel_L999: \n" + " fmov %[ASUM_], "SUMF" \n" + + : [ASUM_] "=r" (asum) //%0 + : [N_] "r" (n), //%1 + [X_] "r" (x), //%2 + [INCX_] "r" (inc_x) //%3 + : "cc", + "memory", + "x0", "x1", "x2", "x3", "x4", "x5", + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" + ); + + return asum; +} + +#if defined(SMP) +static int zasum_thread_function(BLASLONG n, BLASLONG dummy0, + BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) +{ + *result = zasum_compute(n, x, inc_x); + + return 0; +} +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; +#endif + FLOAT asum = 0.0; + +#if defined(SMP) + if (inc_x == 0 || n <= 10000) + nthreads = 1; + else + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { + asum = zasum_compute(n, x, inc_x); + } else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) * 2]; + FLOAT *ptr; + + mode = BLAS_DOUBLE | BLAS_COMPLEX; + + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, + x, inc_x, NULL, 0, result, 0, + ( void *)zasum_thread_function, nthreads); + + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + asum = asum + (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2); + } + } +#else + asum = zasum_compute(n, x, inc_x); +#endif + + return asum; +} diff --git a/kernel/arm64/zsymm_lcopy_sve.c b/kernel/arm64/zsymm_lcopy_sve.c index 6f18aa956..5a17d3b19 100644 --- a/kernel/arm64/zsymm_lcopy_sve.c +++ b/kernel/arm64/zsymm_lcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t N = n; int32_t j = 0; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/zsymm_ucopy_sve.c b/kernel/arm64/zsymm_ucopy_sve.c index 6be48cdaf..06989e3aa 100644 --- a/kernel/arm64/zsymm_ucopy_sve.c +++ b/kernel/arm64/zsymm_ucopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t N = n; int32_t j = 0; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/ztrmm_lncopy_sve_v1.c b/kernel/arm64/ztrmm_lncopy_sve_v1.c index d34f607ab..5a7171d9d 100644 --- a/kernel/arm64/ztrmm_lncopy_sve_v1.c +++ b/kernel/arm64/ztrmm_lncopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; #ifdef DOUBLE svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrmm_ltcopy_sve_v1.c b/kernel/arm64/ztrmm_ltcopy_sve_v1.c index 7f34c9857..3a88f26b2 100644 --- a/kernel/arm64/ztrmm_ltcopy_sve_v1.c +++ b/kernel/arm64/ztrmm_ltcopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; #ifdef DOUBLE - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -129,11 +130,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrmm_uncopy_sve_v1.c b/kernel/arm64/ztrmm_uncopy_sve_v1.c index 7eb9452c9..c3dbdcbe3 100644 --- a/kernel/arm64/ztrmm_uncopy_sve_v1.c +++ b/kernel/arm64/ztrmm_uncopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; #ifdef DOUBLE svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrmm_utcopy_sve_v1.c b/kernel/arm64/ztrmm_utcopy_sve_v1.c index 60c8ff3b4..ddfa7ba4e 100644 --- a/kernel/arm64/ztrmm_utcopy_sve_v1.c +++ b/kernel/arm64/ztrmm_utcopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; #ifdef DOUBLE - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -128,11 +129,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_lncopy_sve.c b/kernel/arm64/ztrsm_lncopy_sve.c index eb7cd0294..f81ba14c2 100644 --- a/kernel/arm64/ztrsm_lncopy_sve.c +++ b/kernel/arm64/ztrsm_lncopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #ifdef DOUBLE int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_ltcopy_sve.c b/kernel/arm64/ztrsm_ltcopy_sve.c index 34dbf8a30..46a11abed 100644 --- a/kernel/arm64/ztrsm_ltcopy_sve.c +++ b/kernel/arm64/ztrsm_ltcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; #ifdef DOUBLE int64_t js = 0; - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_uncopy_sve.c b/kernel/arm64/ztrsm_uncopy_sve.c index 92e086b75..436112130 100644 --- a/kernel/arm64/ztrsm_uncopy_sve.c +++ b/kernel/arm64/ztrsm_uncopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #ifdef DOUBLE int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_utcopy_sve.c b/kernel/arm64/ztrsm_utcopy_sve.c index ccb942e1b..ddf3e265f 100644 --- a/kernel/arm64/ztrsm_utcopy_sve.c +++ b/kernel/arm64/ztrsm_utcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; #ifdef DOUBLE int64_t js = 0; - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/csky/KERNEL b/kernel/csky/KERNEL new file mode 100644 index 000000000..afa8a0881 --- /dev/null +++ b/kernel/csky/KERNEL @@ -0,0 +1,149 @@ +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +DSDOTKERNEL = ../generic/dot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c +LSAME_KERNEL = ../generic/lsame.c + +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + + diff --git a/kernel/csky/Makefile b/kernel/csky/Makefile new file mode 100644 index 000000000..520349bd6 --- /dev/null +++ b/kernel/csky/Makefile @@ -0,0 +1 @@ +clean :: diff --git a/kernel/generic/trmmkernel_16x8.c b/kernel/generic/trmmkernel_16x8.c new file mode 100644 index 000000000..5412eab70 --- /dev/null +++ b/kernel/generic/trmmkernel_16x8.c @@ -0,0 +1,3676 @@ +#include "common.h" + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + FLOAT res0_4; + FLOAT res0_5; + FLOAT res0_6; + FLOAT res0_7; + + FLOAT res0_8; + FLOAT res0_9; + FLOAT res0_10; + FLOAT res0_11; + FLOAT res0_12; + FLOAT res0_13; + FLOAT res0_14; + FLOAT res0_15; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + FLOAT res1_4; + FLOAT res1_5; + FLOAT res1_6; + FLOAT res1_7; + + FLOAT res1_8; + FLOAT res1_9; + FLOAT res1_10; + FLOAT res1_11; + FLOAT res1_12; + FLOAT res1_13; + FLOAT res1_14; + FLOAT res1_15; + + FLOAT res2_0; + FLOAT res2_1; + FLOAT res2_2; + FLOAT res2_3; + FLOAT res2_4; + FLOAT res2_5; + FLOAT res2_6; + FLOAT res2_7; + + FLOAT res2_8; + FLOAT res2_9; + FLOAT res2_10; + FLOAT res2_11; + FLOAT res2_12; + FLOAT res2_13; + FLOAT res2_14; + FLOAT res2_15; + + FLOAT res3_0; + FLOAT res3_1; + FLOAT res3_2; + FLOAT res3_3; + FLOAT res3_4; + FLOAT res3_5; + FLOAT res3_6; + FLOAT res3_7; + + FLOAT res3_8; + FLOAT res3_9; + FLOAT res3_10; + FLOAT res3_11; + FLOAT res3_12; + FLOAT res3_13; + FLOAT res3_14; + FLOAT res3_15; + + FLOAT res4_0; + FLOAT res4_1; + FLOAT res4_2; + FLOAT res4_3; + FLOAT res4_4; + FLOAT res4_5; + FLOAT res4_6; + FLOAT res4_7; + + FLOAT res4_8; + FLOAT res4_9; + FLOAT res4_10; + FLOAT res4_11; + FLOAT res4_12; + FLOAT res4_13; + FLOAT res4_14; + FLOAT res4_15; + + FLOAT res5_0; + FLOAT res5_1; + FLOAT res5_2; + FLOAT res5_3; + FLOAT res5_4; + FLOAT res5_5; + FLOAT res5_6; + FLOAT res5_7; + + FLOAT res5_8; + FLOAT res5_9; + FLOAT res5_10; + FLOAT res5_11; + FLOAT res5_12; + FLOAT res5_13; + FLOAT res5_14; + FLOAT res5_15; + + FLOAT res6_0; + FLOAT res6_1; + FLOAT res6_2; + FLOAT res6_3; + FLOAT res6_4; + FLOAT res6_5; + FLOAT res6_6; + FLOAT res6_7; + + FLOAT res6_8; + FLOAT res6_9; + FLOAT res6_10; + FLOAT res6_11; + FLOAT res6_12; + FLOAT res6_13; + FLOAT res6_14; + FLOAT res6_15; + + FLOAT res7_0; + FLOAT res7_1; + FLOAT res7_2; + FLOAT res7_3; + FLOAT res7_4; + FLOAT res7_5; + FLOAT res7_6; + FLOAT res7_7; + + FLOAT res7_8; + FLOAT res7_9; + FLOAT res7_10; + FLOAT res7_11; + FLOAT res7_12; + FLOAT res7_13; + FLOAT res7_14; + FLOAT res7_15; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + FLOAT b2; + FLOAT b3; + FLOAT b4; + FLOAT b5; + FLOAT b6; + FLOAT b7; + + BLASLONG off, temp; + +#if !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j=0; j +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; + + lda *= 2; + + js = (n >> 4); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + if (offset > -8) ao9 = a + (posX + 8) * 2 + posY * lda; else ao9 = a + posY * 2 + (posX + 8) * lda; + if (offset > -9) ao10 = a + (posX + 9) * 2 + posY * lda; else ao10 = a + posY * 2 + (posX + 9) * lda; + if (offset > -10) ao11 = a + (posX + 10) * 2 + posY * lda; else ao11 = a + posY * 2 + (posX + 10) * lda; + if (offset > -11) ao12 = a + (posX + 11) * 2 + posY * lda; else ao12 = a + posY * 2 + (posX + 11) * lda; + if (offset > -12) ao13 = a + (posX + 12) * 2 + posY * lda; else ao13 = a + posY * 2 + (posX + 12) * lda; + if (offset > -13) ao14 = a + (posX + 13) * 2 + posY * lda; else ao14 = a + posY * 2 + (posX + 13) * lda; + if (offset > -14) ao15 = a + (posX + 14) * 2 + posY * lda; else ao15 = a + posY * 2 + (posX + 14) * lda; + if (offset > -15) ao16 = a + (posX + 15) * 2 + posY * lda; else ao16 = a + posY * 2 + (posX + 15) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + data17 = *(ao9 + 0); + data18 = *(ao9 + 1); + data19 = *(ao10 + 0); + data20 = *(ao10 + 1); + data21 = *(ao11 + 0); + data22 = *(ao11 + 1); + data23 = *(ao12 + 0); + data24 = *(ao12 + 1); + data25 = *(ao13 + 0); + data26 = *(ao13 + 1); + data27 = *(ao14 + 0); + data28 = *(ao14 + 1); + data29 = *(ao15 + 0); + data30 = *(ao15 + 1); + data31 = *(ao16 + 0); + data32 = *(ao16 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + if (offset > -8) ao9 += lda; else ao9 += 2; + if (offset > -9) ao10 += lda; else ao10 += 2; + if (offset > -10) ao11 += lda; else ao11 += 2; + if (offset > -11) ao12 += lda; else ao12 += 2; + if (offset > -12) ao13 += lda; else ao13 += 2; + if (offset > -13) ao14 += lda; else ao14 += 2; + if (offset > -14) ao15 += lda; else ao15 += 2; + if (offset > -15) ao16 += lda; else ao16 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + } else + if (offset < -15) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -2 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -3 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = ZERO; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -4 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -5 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = ZERO; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -6 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = ZERO; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -7 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = ZERO; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -8 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = ZERO; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -9 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = ZERO; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -10 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = ZERO; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -11 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = ZERO; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -12 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = ZERO; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -13 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = ZERO; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -14 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = ZERO; + b[30] = data31; + b[31] = data32; + break; + case -15 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = ZERO; + break; + } + } + + b += 32; + + offset --; + i --; + } + + posX += 16; + js --; + } + + if (n & 8) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + } else + if (offset < -7) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -2 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -3 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = ZERO; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -4 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -5 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = ZERO; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -6 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = ZERO; + b[14] = data15; + b[15] = data16; + break; + case -7 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = ZERO; + break; + } + } + + b += 16; + + offset --; + i --; + } + + posX += 8; + } + + if (n & 4) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + } else + if (offset < -3) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + break; + case -2 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = data08; + break; + case -3 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = ZERO; + break; + } + } + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + } else + if (offset < -1) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + break; + } + } + + b += 4; + + offset --; + i --; + } + + posX += 2; + + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zhemm_utcopy_16.c b/kernel/generic/zhemm_utcopy_16.c new file mode 100644 index 000000000..822483a83 --- /dev/null +++ b/kernel/generic/zhemm_utcopy_16.c @@ -0,0 +1,1168 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; + + lda *= 2; + + js = (n >> 4); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + if (offset > -8) ao9 = a + posY * 2 + (posX + 8) * lda; else ao9 = a + (posX + 8) * 2 + posY * lda; + if (offset > -9) ao10 = a + posY * 2 + (posX + 9) * lda; else ao10 = a + (posX + 9) * 2 + posY * lda; + if (offset > -10) ao11 = a + posY * 2 + (posX + 10) * lda; else ao11 = a + (posX + 10) * 2 + posY * lda; + if (offset > -11) ao12 = a + posY * 2 + (posX + 11) * lda; else ao12 = a + (posX + 11) * 2 + posY * lda; + if (offset > -12) ao13 = a + posY * 2 + (posX + 12) * lda; else ao13 = a + (posX + 12) * 2 + posY * lda; + if (offset > -13) ao14 = a + posY * 2 + (posX + 13) * lda; else ao14 = a + (posX + 13) * 2 + posY * lda; + if (offset > -14) ao15 = a + posY * 2 + (posX + 14) * lda; else ao15 = a + (posX + 14) * 2 + posY * lda; + if (offset > -15) ao16 = a + posY * 2 + (posX + 15) * lda; else ao16 = a + (posX + 15) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + data17 = *(ao9 + 0); + data18 = *(ao9 + 1); + data19 = *(ao10 + 0); + data20 = *(ao10 + 1); + data21 = *(ao11 + 0); + data22 = *(ao11 + 1); + data23 = *(ao12 + 0); + data24 = *(ao12 + 1); + data25 = *(ao13 + 0); + data26 = *(ao13 + 1); + data27 = *(ao14 + 0); + data28 = *(ao14 + 1); + data29 = *(ao15 + 0); + data30 = *(ao15 + 1); + data31 = *(ao16 + 0); + data32 = *(ao16 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + if (offset > -8) ao9 += 2; else ao9 += lda; + if (offset > -9) ao10 += 2; else ao10 += lda; + if (offset > -10) ao11 += 2; else ao11 += lda; + if (offset > -11) ao12 += 2; else ao12 += lda; + if (offset > -12) ao13 += 2; else ao13 += lda; + if (offset > -13) ao14 += 2; else ao14 += lda; + if (offset > -14) ao15 += 2; else ao15 += lda; + if (offset > -15) ao16 += 2; else ao16 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + } else + if (offset < -15) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -2 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -3 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = ZERO; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -4 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = ZERO; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -5 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = ZERO; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -6 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = ZERO; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -7 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = ZERO; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -8 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = ZERO; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -9 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = ZERO; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -10 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = ZERO; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -11 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = ZERO; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -12 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = ZERO; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -13 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = ZERO; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -14 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = ZERO; + b[30] = data31; + b[31] = -data32; + break; + case -15 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = ZERO; + break; + } + } + + b += 32; + + offset --; + i --; + } + + posX += 16; + js --; + } + + if (n & 8) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + } else + if (offset < -7) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -2 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -3 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = ZERO; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -4 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = ZERO; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -5 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = ZERO; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -6 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = ZERO; + b[14] = data15; + b[15] = -data16; + break; + case -7 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = ZERO; + break; + } + } + + b += 16; + + offset --; + i --; + } + + posX += 8; + } + + if (n & 4) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + } else + if (offset < -3) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -2 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -3 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = ZERO; + break; + } + } + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + } else + if (offset < -1) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + break; + } + } + + b += 4; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zimatcopy_cnc.c b/kernel/generic/zimatcopy_cnc.c index 8e772bd8a..6426cffc0 100644 --- a/kernel/generic/zimatcopy_cnc.c +++ b/kernel/generic/zimatcopy_cnc.c @@ -40,7 +40,6 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, if ( rows <= 0 ) return(0); if ( cols <= 0 ) return(0); - if ( alpha_r == 1.0 && alpha_i == 0.0 ) return (0); aptr = a; lda *= 2; diff --git a/kernel/generic/zlaswp_ncopy_8.c b/kernel/generic/zlaswp_ncopy_8.c new file mode 100644 index 000000000..8bd41749d --- /dev/null +++ b/kernel/generic/zlaswp_ncopy_8.c @@ -0,0 +1,1051 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#define a2 (a1 + 2) +#define a4 (a3 + 2) +#define a6 (a5 + 2) +#define a8 (a7 + 2) + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1, *a3, *a5, *a7; + FLOAT *b1, *b2, *b3, *b4; + FLOAT *b5, *b6, *b7, *b8; + FLOAT A1, A2, A3, A4, A5, A6, A7, A8; + FLOAT B1, B2, B3, B4, B5, B6, B7, B8; + + FLOAT A9, A10, A11, A12, A13, A14, A15, A16; + FLOAT B9, B10, B11, B12, B13, B14, B15, B16; + + a -= 2; + lda *= 2; + k1 --; + + ipiv += k1; + + if (n <= 0) return 0; + + j = (n >> 3); + if (j > 0) { + do { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + for( int pass = 0; pass < 2; ++pass ) { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + A2 = *(a2 + 0); + A10 = *(a2 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + A4 = *(a4 + 0); + A12 = *(a4 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + A6 = *(a6 + 0); + A14 = *(a6 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + A8 = *(a8 + 0); + A16 = *(a8 + 1); + + B1 = *(b1 + 0); + B9 = *(b1 + 1); + B2 = *(b2 + 0); + B10 = *(b2 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + B4 = *(b4 + 0); + B12 = *(b4 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + B6 = *(b6 + 0); + B14 = *(b6 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + B8 = *(b8 + 0); + B16 = *(b8 + 1); + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + } else { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A1; + *(b2 + 1) = A9; + *(b4 + 0) = A3; + *(b4 + 1) = A11; + *(b6 + 0) = A5; + *(b6 + 1) = A13; + *(b8 + 0) = A7; + *(b8 + 1) = A15; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + *(b1 + 0) = A2; + *(b1 + 1) = A10; + *(b3 + 0) = A4; + *(b3 + 1) = A12; + *(b5 + 0) = A6; + *(b5 + 1) = A14; + *(b7 + 0) = A8; + *(b7 + 1) = A16; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } + b1 += 4*lda; + b2 += 4*lda; + b3 += 4*lda; + b4 += 4*lda; + b5 += 4*lda; + b6 += 4*lda; + b7 += 4*lda; + b8 += 4*lda; + + a1 += 4; + a3 += 4; + a5 += 4; + a7 += 4; + + buffer += 16; + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + B1 = *(b1 + 0); + B9 = *(b1 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } + buffer += 8; + } + + a += 4 * lda; + + j --; + } while (j > 0); + } + + + if (n & 4) { + { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + A2 = *(a2 + 0); + A10 = *(a2 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + A4 = *(a4 + 0); + A12 = *(a4 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + A6 = *(a6 + 0); + A14 = *(a6 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + A8 = *(a8 + 0); + A16 = *(a8 + 1); + + B1 = *(b1 + 0); + B9 = *(b1 + 1); + B2 = *(b2 + 0); + B10 = *(b2 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + B4 = *(b4 + 0); + B12 = *(b4 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + B6 = *(b6 + 0); + B14 = *(b6 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + B8 = *(b8 + 0); + B16 = *(b8 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + } else { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A1; + *(b2 + 1) = A9; + *(b4 + 0) = A3; + *(b4 + 1) = A11; + *(b6 + 0) = A5; + *(b6 + 1) = A13; + *(b8 + 0) = A7; + *(b8 + 1) = A15; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + *(b1 + 0) = A2; + *(b1 + 1) = A10; + *(b3 + 0) = A4; + *(b3 + 1) = A12; + *(b5 + 0) = A6; + *(b5 + 1) = A14; + *(b7 + 0) = A8; + *(b7 + 1) = A16; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } + + buffer += 16; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + a1 += 4; + a3 += 4; + a5 += 4; + a7 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + B1 = *(b1 + 0); + B9 = *(b1 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } + buffer += 8; + } + + a += 4 * lda; + } + } //if (n & 4) + + if (n & 2) { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + a3 = a1 + lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A5; + *(buffer + 3) = A6; + *(buffer + 4) = A3; + *(buffer + 5) = A4; + *(buffer + 6) = A7; + *(buffer + 7) = A8; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A5; + *(buffer + 3) = A6; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } else { + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A7; + *(buffer + 3) = A8; + *(buffer + 4) = A1; + *(buffer + 5) = A2; + *(buffer + 6) = A5; + *(buffer + 7) = A6; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A7; + *(buffer + 3) = A8; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = A3; + *(buffer + 5) = A4; + *(buffer + 6) = A7; + *(buffer + 7) = A8; + + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else { + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = A1; + *(buffer + 5) = A2; + *(buffer + 6) = A5; + *(buffer + 7) = A6; + + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } + } + } + + buffer += 8; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + a1 += 4; + a3 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + A3 = *(a3 + 0); + A4 = *(a3 + 1); + B3 = *(b3 + 0); + B4 = *(b3 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A3; + *(b3 + 1) = A4; + } + buffer += 4; + } + + a += 2 * lda; + } + + if (n & 1) { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + buffer += 4; + + b1 = a + ip1; + b2 = a + ip2; + + a1 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } + // buffer += 2; + } + } + + return 0; +} + diff --git a/kernel/generic/zneg_tcopy_16.c b/kernel/generic/zneg_tcopy_16.c new file mode 100644 index 000000000..50f5a3d37 --- /dev/null +++ b/kernel/generic/zneg_tcopy_16.c @@ -0,0 +1,587 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2; + + FLOAT *boffset; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + FLOAT ctemp33, ctemp34, ctemp35, ctemp36; + FLOAT ctemp37, ctemp38, ctemp39, ctemp40; + FLOAT ctemp41, ctemp42, ctemp43, ctemp44; + FLOAT ctemp45, ctemp46, ctemp47, ctemp48; + FLOAT ctemp49, ctemp50, ctemp51, ctemp52; + FLOAT ctemp53, ctemp54, ctemp55, ctemp56; + FLOAT ctemp57, ctemp58, ctemp59, ctemp60; + FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + + aoffset = a; + boffset = b; + lda *= 2; + +#if 0 + fprintf(stderr, "M = %d N = %d\n", m, n); +#endif + + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 32; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + ctemp17 = *(aoffset1 + 16); + ctemp18 = *(aoffset1 + 17); + ctemp19 = *(aoffset1 + 18); + ctemp20 = *(aoffset1 + 19); + ctemp21 = *(aoffset1 + 20); + ctemp22 = *(aoffset1 + 21); + ctemp23 = *(aoffset1 + 22); + ctemp24 = *(aoffset1 + 23); + ctemp25 = *(aoffset1 + 24); + ctemp26 = *(aoffset1 + 25); + ctemp27 = *(aoffset1 + 26); + ctemp28 = *(aoffset1 + 27); + ctemp29 = *(aoffset1 + 28); + ctemp30 = *(aoffset1 + 29); + ctemp31 = *(aoffset1 + 30); + ctemp32 = *(aoffset1 + 31); + + ctemp33 = *(aoffset2 + 0); + ctemp34 = *(aoffset2 + 1); + ctemp35 = *(aoffset2 + 2); + ctemp36 = *(aoffset2 + 3); + ctemp37 = *(aoffset2 + 4); + ctemp38 = *(aoffset2 + 5); + ctemp39 = *(aoffset2 + 6); + ctemp40 = *(aoffset2 + 7); + ctemp41 = *(aoffset2 + 8); + ctemp42 = *(aoffset2 + 9); + ctemp43 = *(aoffset2 + 10); + ctemp44 = *(aoffset2 + 11); + ctemp45 = *(aoffset2 + 12); + ctemp46 = *(aoffset2 + 13); + ctemp47 = *(aoffset2 + 14); + ctemp48 = *(aoffset2 + 15); + ctemp49 = *(aoffset2 + 16); + ctemp50 = *(aoffset2 + 17); + ctemp51 = *(aoffset2 + 18); + ctemp52 = *(aoffset2 + 19); + ctemp53 = *(aoffset2 + 20); + ctemp54 = *(aoffset2 + 21); + ctemp55 = *(aoffset2 + 22); + ctemp56 = *(aoffset2 + 23); + ctemp57 = *(aoffset2 + 24); + ctemp58 = *(aoffset2 + 25); + ctemp59 = *(aoffset2 + 26); + ctemp60 = *(aoffset2 + 27); + ctemp61 = *(aoffset2 + 28); + ctemp62 = *(aoffset2 + 29); + ctemp63 = *(aoffset2 + 30); + ctemp64 = *(aoffset2 + 31); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + *(boffset + 16) = -ctemp17; + *(boffset + 17) = -ctemp18; + *(boffset + 18) = -ctemp19; + *(boffset + 19) = -ctemp20; + *(boffset + 20) = -ctemp21; + *(boffset + 21) = -ctemp22; + *(boffset + 22) = -ctemp23; + *(boffset + 23) = -ctemp24; + + *(boffset + 24) = -ctemp25; + *(boffset + 25) = -ctemp26; + *(boffset + 26) = -ctemp27; + *(boffset + 27) = -ctemp28; + *(boffset + 28) = -ctemp29; + *(boffset + 29) = -ctemp30; + *(boffset + 30) = -ctemp31; + *(boffset + 31) = -ctemp32; + + *(boffset + 32) = -ctemp33; + *(boffset + 33) = -ctemp34; + *(boffset + 34) = -ctemp35; + *(boffset + 35) = -ctemp36; + *(boffset + 36) = -ctemp37; + *(boffset + 37) = -ctemp38; + *(boffset + 38) = -ctemp39; + *(boffset + 39) = -ctemp40; + + *(boffset + 40) = -ctemp41; + *(boffset + 41) = -ctemp42; + *(boffset + 42) = -ctemp43; + *(boffset + 43) = -ctemp44; + *(boffset + 44) = -ctemp45; + *(boffset + 45) = -ctemp46; + *(boffset + 46) = -ctemp47; + *(boffset + 47) = -ctemp48; + + *(boffset + 48) = -ctemp49; + *(boffset + 49) = -ctemp50; + *(boffset + 50) = -ctemp51; + *(boffset + 51) = -ctemp52; + *(boffset + 52) = -ctemp53; + *(boffset + 53) = -ctemp54; + *(boffset + 54) = -ctemp55; + *(boffset + 55) = -ctemp56; + + *(boffset + 56) = -ctemp57; + *(boffset + 57) = -ctemp58; + *(boffset + 58) = -ctemp59; + *(boffset + 59) = -ctemp60; + *(boffset + 60) = -ctemp61; + *(boffset + 61) = -ctemp62; + *(boffset + 62) = -ctemp63; + *(boffset + 63) = -ctemp64; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 64; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + ctemp17 = *(aoffset1 + 16); + ctemp18 = *(aoffset1 + 17); + ctemp19 = *(aoffset1 + 18); + ctemp20 = *(aoffset1 + 19); + ctemp21 = *(aoffset1 + 20); + ctemp22 = *(aoffset1 + 21); + ctemp23 = *(aoffset1 + 22); + ctemp24 = *(aoffset1 + 23); + ctemp25 = *(aoffset1 + 24); + ctemp26 = *(aoffset1 + 25); + ctemp27 = *(aoffset1 + 26); + ctemp28 = *(aoffset1 + 27); + ctemp29 = *(aoffset1 + 28); + ctemp30 = *(aoffset1 + 29); + ctemp31 = *(aoffset1 + 30); + ctemp32 = *(aoffset1 + 31); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + *(boffset + 16) = -ctemp17; + *(boffset + 17) = -ctemp18; + *(boffset + 18) = -ctemp19; + *(boffset + 19) = -ctemp20; + *(boffset + 20) = -ctemp21; + *(boffset + 21) = -ctemp22; + *(boffset + 22) = -ctemp23; + *(boffset + 23) = -ctemp24; + + *(boffset + 24) = -ctemp25; + *(boffset + 25) = -ctemp26; + *(boffset + 26) = -ctemp27; + *(boffset + 27) = -ctemp28; + *(boffset + 28) = -ctemp29; + *(boffset + 29) = -ctemp30; + *(boffset + 30) = -ctemp31; + *(boffset + 31) = -ctemp32; + + boffset += 32; + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 16; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + ctemp17 = *(aoffset2 + 0); + ctemp18 = *(aoffset2 + 1); + ctemp19 = *(aoffset2 + 2); + ctemp20 = *(aoffset2 + 3); + ctemp21 = *(aoffset2 + 4); + ctemp22 = *(aoffset2 + 5); + ctemp23 = *(aoffset2 + 6); + ctemp24 = *(aoffset2 + 7); + ctemp25 = *(aoffset2 + 8); + ctemp26 = *(aoffset2 + 9); + ctemp27 = *(aoffset2 + 10); + ctemp28 = *(aoffset2 + 11); + ctemp29 = *(aoffset2 + 12); + ctemp30 = *(aoffset2 + 13); + ctemp31 = *(aoffset2 + 14); + ctemp32 = *(aoffset2 + 15); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + *(boffset + 16) = -ctemp17; + *(boffset + 17) = -ctemp18; + *(boffset + 18) = -ctemp19; + *(boffset + 19) = -ctemp20; + *(boffset + 20) = -ctemp21; + *(boffset + 21) = -ctemp22; + *(boffset + 22) = -ctemp23; + *(boffset + 23) = -ctemp24; + + *(boffset + 24) = -ctemp25; + *(boffset + 25) = -ctemp26; + *(boffset + 26) = -ctemp27; + *(boffset + 27) = -ctemp28; + *(boffset + 28) = -ctemp29; + *(boffset + 29) = -ctemp30; + *(boffset + 30) = -ctemp31; + *(boffset + 31) = -ctemp32; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 32; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + boffset += 16; + } + } + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 8; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 16; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + boffset += 8; + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 4; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + + boffset += 4; + } + } + + if (n & 1){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + // aoffset += 2; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + // boffset += 2; + } + } + + return 0; +} diff --git a/kernel/generic/zsymm_lcopy_16.c b/kernel/generic/zsymm_lcopy_16.c new file mode 100644 index 000000000..b32374a5e --- /dev/null +++ b/kernel/generic/zsymm_lcopy_16.c @@ -0,0 +1,333 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; + + lda *= 2; + + js = (n >> 4); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + if (offset > -8) ao9 = a + (posX + 8) * 2 + posY * lda; else ao9 = a + posY * 2 + (posX + 8) * lda; + if (offset > -9) ao10 = a + (posX + 9) * 2 + posY * lda; else ao10 = a + posY * 2 + (posX + 9) * lda; + if (offset > -10) ao11 = a + (posX + 10) * 2 + posY * lda; else ao11 = a + posY * 2 + (posX + 10) * lda; + if (offset > -11) ao12 = a + (posX + 11) * 2 + posY * lda; else ao12 = a + posY * 2 + (posX + 11) * lda; + if (offset > -12) ao13 = a + (posX + 12) * 2 + posY * lda; else ao13 = a + posY * 2 + (posX + 12) * lda; + if (offset > -13) ao14 = a + (posX + 13) * 2 + posY * lda; else ao14 = a + posY * 2 + (posX + 13) * lda; + if (offset > -14) ao15 = a + (posX + 14) * 2 + posY * lda; else ao15 = a + posY * 2 + (posX + 14) * lda; + if (offset > -15) ao16 = a + (posX + 15) * 2 + posY * lda; else ao16 = a + posY * 2 + (posX + 15) * lda; + + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + data17 = *(ao9 + 0); + data18 = *(ao9 + 1); + data19 = *(ao10 + 0); + data20 = *(ao10 + 1); + data21 = *(ao11 + 0); + data22 = *(ao11 + 1); + data23 = *(ao12 + 0); + data24 = *(ao12 + 1); + data25 = *(ao13 + 0); + data26 = *(ao13 + 1); + data27 = *(ao14 + 0); + data28 = *(ao14 + 1); + data29 = *(ao15 + 0); + data30 = *(ao15 + 1); + data31 = *(ao16 + 0); + data32 = *(ao16 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + if (offset > -8) ao9 += lda; else ao9 += 2; + if (offset > -9) ao10 += lda; else ao10 += 2; + if (offset > -10) ao11 += lda; else ao11 += 2; + if (offset > -11) ao12 += lda; else ao12 += 2; + if (offset > -12) ao13 += lda; else ao13 += 2; + if (offset > -13) ao14 += lda; else ao14 += 2; + if (offset > -14) ao15 += lda; else ao15 += 2; + if (offset > -15) ao16 += lda; else ao16 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + + b += 32; + + offset --; + i --; + } + + posX += 16; + js --; + } + + if (n & 8) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b += 16; + + offset --; + i --; + } + + posX += 8; + } + + if (n & 4) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 2; + + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zsymm_ucopy_16.c b/kernel/generic/zsymm_ucopy_16.c new file mode 100644 index 000000000..cb19bea47 --- /dev/null +++ b/kernel/generic/zsymm_ucopy_16.c @@ -0,0 +1,332 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; + + lda *= 2; + + js = (n >> 4); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + if (offset > -8) ao9 = a + posY * 2 + (posX + 8) * lda; else ao9 = a + (posX + 8) * 2 + posY * lda; + if (offset > -9) ao10 = a + posY * 2 + (posX + 9) * lda; else ao10 = a + (posX + 9) * 2 + posY * lda; + if (offset > -10) ao11 = a + posY * 2 + (posX + 10) * lda; else ao11 = a + (posX + 10) * 2 + posY * lda; + if (offset > -11) ao12 = a + posY * 2 + (posX + 11) * lda; else ao12 = a + (posX + 11) * 2 + posY * lda; + if (offset > -12) ao13 = a + posY * 2 + (posX + 12) * lda; else ao13 = a + (posX + 12) * 2 + posY * lda; + if (offset > -13) ao14 = a + posY * 2 + (posX + 13) * lda; else ao14 = a + (posX + 13) * 2 + posY * lda; + if (offset > -14) ao15 = a + posY * 2 + (posX + 14) * lda; else ao15 = a + (posX + 14) * 2 + posY * lda; + if (offset > -15) ao16 = a + posY * 2 + (posX + 15) * lda; else ao16 = a + (posX + 15) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + data17 = *(ao9 + 0); + data18 = *(ao9 + 1); + data19 = *(ao10 + 0); + data20 = *(ao10 + 1); + data21 = *(ao11 + 0); + data22 = *(ao11 + 1); + data23 = *(ao12 + 0); + data24 = *(ao12 + 1); + data25 = *(ao13 + 0); + data26 = *(ao13 + 1); + data27 = *(ao14 + 0); + data28 = *(ao14 + 1); + data29 = *(ao15 + 0); + data30 = *(ao15 + 1); + data31 = *(ao16 + 0); + data32 = *(ao16 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + if (offset > -8) ao9 += 2; else ao9 += lda; + if (offset > -9) ao10 += 2; else ao10 += lda; + if (offset > -10) ao11 += 2; else ao11 += lda; + if (offset > -11) ao12 += 2; else ao12 += lda; + if (offset > -12) ao13 += 2; else ao13 += lda; + if (offset > -13) ao14 += 2; else ao14 += lda; + if (offset > -14) ao15 += 2; else ao15 += lda; + if (offset > -15) ao16 += 2; else ao16 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + + b += 32; + + offset --; + i --; + } + + posX += 16; + js --; + } + + if (n & 8) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b += 16; + + offset --; + i --; + } + + posX += 8; + } + + if (n & 4) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/ztrmm_lncopy_16.c b/kernel/generic/ztrmm_lncopy_16.c new file mode 100644 index 000000000..d7fb23176 --- /dev/null +++ b/kernel/generic/ztrmm_lncopy_16.c @@ -0,0 +1,2310 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + lda += lda; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + a09 = a + posY * 2 + (posX + 8) * lda; + a10 = a + posY * 2 + (posX + 9) * lda; + a11 = a + posY * 2 + (posX + 10) * lda; + a12 = a + posY * 2 + (posX + 11) * lda; + a13 = a + posY * 2 + (posX + 12) * lda; + a14 = a + posY * 2 + (posX + 13) * lda; + a15 = a + posY * 2 + (posX + 14) * lda; + a16 = a + posY * 2 + (posX + 15) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + a09 = a + posX * 2 + (posY + 8) * lda; + a10 = a + posX * 2 + (posY + 9) * lda; + a11 = a + posX * 2 + (posY + 10) * lda; + a12 = a + posX * 2 + (posY + 11) * lda; + a13 = a + posX * 2 + (posY + 12) * lda; + a14 = a + posX * 2 + (posY + 13) * lda; + a15 = a + posX * 2 + (posY + 14) * lda; + a16 = a + posX * 2 + (posY + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + a09 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; + } + } else + if (X < posY) { + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + b += 512; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a01 + 2); + b[ 33] = *(a01 + 3); +#ifdef UNIT + b[ 34] = ONE; + b[ 35] = ZERO; +#else + b[ 34] = *(a02 + 2); + b[ 35] = *(a02 + 3); +#endif + b[ 36] = ZERO; + b[ 37] = ZERO; + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; + b[ 54] = ZERO; + b[ 55] = ZERO; + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a01 + 4); + b[ 65] = *(a01 + 5); + b[ 66] = *(a02 + 4); + b[ 67] = *(a02 + 5); +#ifdef UNIT + b[ 68] = ONE; + b[ 69] = ZERO; +#else + b[ 68] = *(a03 + 4); + b[ 69] = *(a03 + 5); +#endif + b[ 70] = ZERO; + b[ 71] = ZERO; + b[ 72] = ZERO; + b[ 73] = ZERO; + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; + b[ 85] = ZERO; + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; + b[ 90] = ZERO; + b[ 91] = ZERO; + b[ 92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a01 + 6); + b[ 97] = *(a01 + 7); + b[ 98] = *(a02 + 6); + b[ 99] = *(a02 + 7); + b[100] = *(a03 + 6); + b[101] = *(a03 + 7); +#ifdef UNIT + b[102] = ONE; + b[103] = ZERO; +#else + b[102] = *(a04 + 6); + b[103] = *(a04 + 7); +#endif + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; + b[108] = ZERO; + b[109] = ZERO; + b[110] = ZERO; + b[111] = ZERO; + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; + b[119] = ZERO; + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; + b[126] = ZERO; + b[127] = ZERO; + + b[128] = *(a01 + 8); + b[129] = *(a01 + 9); + b[130] = *(a02 + 8); + b[131] = *(a02 + 9); + b[132] = *(a03 + 8); + b[133] = *(a03 + 9); + b[134] = *(a04 + 8); + b[135] = *(a04 + 9); +#ifdef UNIT + b[136] = ONE; + b[137] = ZERO; +#else + b[136] = *(a05 + 8); + b[137] = *(a05 + 9); +#endif + b[138] = ZERO; + b[139] = ZERO; + b[140] = ZERO; + b[141] = ZERO; + b[142] = ZERO; + b[143] = ZERO; + b[144] = ZERO; + b[145] = ZERO; + b[146] = ZERO; + b[147] = ZERO; + b[148] = ZERO; + b[149] = ZERO; + b[150] = ZERO; + b[151] = ZERO; + b[152] = ZERO; + b[153] = ZERO; + b[154] = ZERO; + b[155] = ZERO; + b[156] = ZERO; + b[157] = ZERO; + b[158] = ZERO; + b[159] = ZERO; + + b[160] = *(a01 + 10); + b[161] = *(a01 + 11); + b[162] = *(a02 + 10); + b[163] = *(a02 + 11); + b[164] = *(a03 + 10); + b[165] = *(a03 + 11); + b[166] = *(a04 + 10); + b[167] = *(a04 + 11); + b[168] = *(a05 + 10); + b[169] = *(a05 + 11); +#ifdef UNIT + b[170] = ONE; + b[171] = ZERO; +#else + b[170] = *(a06 + 10); + b[171] = *(a06 + 11); +#endif + b[172] = ZERO; + b[173] = ZERO; + b[174] = ZERO; + b[175] = ZERO; + b[176] = ZERO; + b[177] = ZERO; + b[178] = ZERO; + b[179] = ZERO; + b[180] = ZERO; + b[181] = ZERO; + b[182] = ZERO; + b[183] = ZERO; + b[184] = ZERO; + b[185] = ZERO; + b[186] = ZERO; + b[187] = ZERO; + b[188] = ZERO; + b[189] = ZERO; + b[190] = ZERO; + b[191] = ZERO; + + b[192] = *(a01 + 12); + b[193] = *(a01 + 13); + b[194] = *(a02 + 12); + b[195] = *(a02 + 13); + b[196] = *(a03 + 12); + b[197] = *(a03 + 13); + b[198] = *(a04 + 12); + b[199] = *(a04 + 13); + b[200] = *(a05 + 12); + b[201] = *(a05 + 13); + b[202] = *(a06 + 12); + b[203] = *(a06 + 13); +#ifdef UNIT + b[204] = ONE; + b[205] = ZERO; +#else + b[204] = *(a07 + 12); + b[205] = *(a07 + 13); +#endif + b[206] = ZERO; + b[207] = ZERO; + b[208] = ZERO; + b[209] = ZERO; + b[210] = ZERO; + b[211] = ZERO; + b[212] = ZERO; + b[213] = ZERO; + b[214] = ZERO; + b[215] = ZERO; + b[216] = ZERO; + b[217] = ZERO; + b[218] = ZERO; + b[219] = ZERO; + b[220] = ZERO; + b[221] = ZERO; + b[222] = ZERO; + b[223] = ZERO; + + b[224] = *(a01 + 14); + b[225] = *(a01 + 15); + b[226] = *(a02 + 14); + b[227] = *(a02 + 15); + b[228] = *(a03 + 14); + b[229] = *(a03 + 15); + b[230] = *(a04 + 14); + b[231] = *(a04 + 15); + b[232] = *(a05 + 14); + b[233] = *(a05 + 15); + b[234] = *(a06 + 14); + b[235] = *(a06 + 15); + b[236] = *(a07 + 14); + b[237] = *(a07 + 15); +#ifdef UNIT + b[238] = ONE; + b[239] = ZERO; +#else + b[238] = *(a08 + 14); + b[239] = *(a08 + 15); +#endif + b[240] = ZERO; + b[241] = ZERO; + b[242] = ZERO; + b[243] = ZERO; + b[244] = ZERO; + b[245] = ZERO; + b[246] = ZERO; + b[247] = ZERO; + b[248] = ZERO; + b[249] = ZERO; + b[250] = ZERO; + b[251] = ZERO; + b[252] = ZERO; + b[253] = ZERO; + b[254] = ZERO; + b[255] = ZERO; + + b[256] = *(a01 + 16); + b[257] = *(a01 + 17); + b[258] = *(a02 + 16); + b[259] = *(a02 + 17); + b[260] = *(a03 + 16); + b[261] = *(a03 + 17); + b[262] = *(a04 + 16); + b[263] = *(a04 + 17); + b[264] = *(a05 + 16); + b[265] = *(a05 + 17); + b[266] = *(a06 + 16); + b[267] = *(a06 + 17); + b[268] = *(a07 + 16); + b[269] = *(a07 + 17); + b[270] = *(a08 + 16); + b[271] = *(a08 + 17); +#ifdef UNIT + b[272] = ONE; + b[273] = ZERO; +#else + b[272] = *(a09 + 16); + b[273] = *(a09 + 17); +#endif + b[274] = ZERO; + b[275] = ZERO; + b[276] = ZERO; + b[277] = ZERO; + b[278] = ZERO; + b[279] = ZERO; + b[280] = ZERO; + b[281] = ZERO; + b[282] = ZERO; + b[283] = ZERO; + b[284] = ZERO; + b[285] = ZERO; + b[286] = ZERO; + b[287] = ZERO; + + b[288] = *(a01 + 18); + b[289] = *(a01 + 19); + b[290] = *(a02 + 18); + b[291] = *(a02 + 19); + b[292] = *(a03 + 18); + b[293] = *(a03 + 19); + b[294] = *(a04 + 18); + b[295] = *(a04 + 19); + b[296] = *(a05 + 18); + b[297] = *(a05 + 19); + b[298] = *(a06 + 18); + b[299] = *(a06 + 19); + b[300] = *(a07 + 18); + b[301] = *(a07 + 19); + b[302] = *(a08 + 18); + b[303] = *(a08 + 19); + b[304] = *(a09 + 18); + b[305] = *(a09 + 19); +#ifdef UNIT + b[306] = ONE; + b[307] = ZERO; +#else + b[306] = *(a10 + 18); + b[307] = *(a10 + 19); +#endif + b[308] = ZERO; + b[309] = ZERO; + b[310] = ZERO; + b[311] = ZERO; + b[312] = ZERO; + b[313] = ZERO; + b[314] = ZERO; + b[315] = ZERO; + b[316] = ZERO; + b[317] = ZERO; + b[318] = ZERO; + b[319] = ZERO; + + b[320] = *(a01 + 20); + b[321] = *(a01 + 21); + b[322] = *(a02 + 20); + b[323] = *(a02 + 21); + b[324] = *(a03 + 20); + b[325] = *(a03 + 21); + b[326] = *(a04 + 20); + b[327] = *(a04 + 21); + b[328] = *(a05 + 20); + b[329] = *(a05 + 21); + b[330] = *(a06 + 20); + b[331] = *(a06 + 21); + b[332] = *(a07 + 20); + b[333] = *(a07 + 21); + b[334] = *(a08 + 20); + b[335] = *(a08 + 21); + b[336] = *(a09 + 20); + b[337] = *(a09 + 21); + b[338] = *(a10 + 20); + b[339] = *(a10 + 21); +#ifdef UNIT + b[340] = ONE; + b[341] = ZERO; +#else + b[340] = *(a11 + 20); + b[341] = *(a11 + 21); +#endif + b[342] = ZERO; + b[343] = ZERO; + b[344] = ZERO; + b[345] = ZERO; + b[346] = ZERO; + b[347] = ZERO; + b[348] = ZERO; + b[349] = ZERO; + b[350] = ZERO; + b[351] = ZERO; + + b[352] = *(a01 + 22); + b[353] = *(a01 + 23); + b[354] = *(a02 + 22); + b[355] = *(a02 + 23); + b[356] = *(a03 + 22); + b[357] = *(a03 + 23); + b[358] = *(a04 + 22); + b[359] = *(a04 + 23); + b[360] = *(a05 + 22); + b[361] = *(a05 + 23); + b[362] = *(a06 + 22); + b[363] = *(a06 + 23); + b[364] = *(a07 + 22); + b[365] = *(a07 + 23); + b[366] = *(a08 + 22); + b[367] = *(a08 + 23); + b[368] = *(a09 + 22); + b[369] = *(a09 + 23); + b[370] = *(a10 + 22); + b[371] = *(a10 + 23); + b[372] = *(a11 + 22); + b[373] = *(a11 + 23); +#ifdef UNIT + b[374] = ONE; + b[375] = ZERO; +#else + b[374] = *(a12 + 22); + b[375] = *(a12 + 23); +#endif + b[376] = ZERO; + b[377] = ZERO; + b[378] = ZERO; + b[379] = ZERO; + b[380] = ZERO; + b[381] = ZERO; + b[382] = ZERO; + b[383] = ZERO; + + b[384] = *(a01 + 24); + b[385] = *(a01 + 25); + b[386] = *(a02 + 24); + b[387] = *(a02 + 25); + b[388] = *(a03 + 24); + b[389] = *(a03 + 25); + b[390] = *(a04 + 24); + b[391] = *(a04 + 25); + b[392] = *(a05 + 24); + b[393] = *(a05 + 25); + b[394] = *(a06 + 24); + b[395] = *(a06 + 25); + b[396] = *(a07 + 24); + b[397] = *(a07 + 25); + b[398] = *(a08 + 24); + b[399] = *(a08 + 25); + b[400] = *(a09 + 24); + b[401] = *(a09 + 25); + b[402] = *(a10 + 24); + b[403] = *(a10 + 25); + b[404] = *(a11 + 24); + b[405] = *(a11 + 25); + b[406] = *(a12 + 24); + b[407] = *(a12 + 25); +#ifdef UNIT + b[408] = ONE; + b[409] = ZERO; +#else + b[408] = *(a13 + 24); + b[409] = *(a13 + 25); +#endif + b[410] = ZERO; + b[411] = ZERO; + b[412] = ZERO; + b[413] = ZERO; + b[414] = ZERO; + b[415] = ZERO; + + b[416] = *(a01 + 26); + b[417] = *(a01 + 27); + b[418] = *(a02 + 26); + b[419] = *(a02 + 27); + b[420] = *(a03 + 26); + b[421] = *(a03 + 27); + b[422] = *(a04 + 26); + b[423] = *(a04 + 27); + b[424] = *(a05 + 26); + b[425] = *(a05 + 27); + b[426] = *(a06 + 26); + b[427] = *(a06 + 27); + b[428] = *(a07 + 26); + b[429] = *(a07 + 27); + b[430] = *(a08 + 26); + b[431] = *(a08 + 27); + b[432] = *(a09 + 26); + b[433] = *(a09 + 27); + b[434] = *(a10 + 26); + b[435] = *(a10 + 27); + b[436] = *(a11 + 26); + b[437] = *(a11 + 27); + b[438] = *(a12 + 26); + b[439] = *(a12 + 27); + b[440] = *(a13 + 26); + b[441] = *(a13 + 27); +#ifdef UNIT + b[442] = ONE; + b[443] = ZERO; +#else + b[442] = *(a14 + 26); + b[443] = *(a14 + 27); +#endif + b[444] = ZERO; + b[445] = ZERO; + b[446] = ZERO; + b[447] = ZERO; + + b[448] = *(a01 + 28); + b[449] = *(a01 + 29); + b[450] = *(a02 + 28); + b[451] = *(a02 + 29); + b[452] = *(a03 + 28); + b[453] = *(a03 + 29); + b[454] = *(a04 + 28); + b[455] = *(a04 + 29); + b[456] = *(a05 + 28); + b[457] = *(a05 + 29); + b[458] = *(a06 + 28); + b[459] = *(a06 + 29); + b[460] = *(a07 + 28); + b[461] = *(a07 + 29); + b[462] = *(a08 + 28); + b[463] = *(a08 + 29); + b[464] = *(a09 + 28); + b[465] = *(a09 + 29); + b[466] = *(a10 + 28); + b[467] = *(a10 + 29); + b[468] = *(a11 + 28); + b[469] = *(a11 + 29); + b[470] = *(a12 + 28); + b[471] = *(a12 + 29); + b[472] = *(a13 + 28); + b[473] = *(a13 + 29); + b[474] = *(a14 + 28); + b[475] = *(a14 + 29); +#ifdef UNIT + b[476] = ONE; + b[477] = ZERO; +#else + b[476] = *(a15 + 28); + b[477] = *(a15 + 29); +#endif + b[478] = ZERO; + b[479] = ZERO; + + b[480] = *(a01 + 30); + b[481] = *(a01 + 31); + b[482] = *(a02 + 30); + b[483] = *(a02 + 31); + b[484] = *(a03 + 30); + b[485] = *(a03 + 31); + b[486] = *(a04 + 30); + b[487] = *(a04 + 31); + b[488] = *(a05 + 30); + b[489] = *(a05 + 31); + b[490] = *(a06 + 30); + b[491] = *(a06 + 31); + b[492] = *(a07 + 30); + b[493] = *(a07 + 31); + b[494] = *(a08 + 30); + b[495] = *(a08 + 31); + b[496] = *(a09 + 30); + b[497] = *(a09 + 31); + b[498] = *(a10 + 30); + b[499] = *(a10 + 31); + b[500] = *(a11 + 30); + b[501] = *(a11 + 31); + b[502] = *(a12 + 30); + b[503] = *(a12 + 31); + b[504] = *(a13 + 30); + b[505] = *(a13 + 31); + b[506] = *(a14 + 30); + b[507] = *(a14 + 31); + b[508] = *(a15 + 30); + b[509] = *(a15 + 31); +#ifdef UNIT + b[510] = ONE; + b[511] = ZERO; +#else + b[510] = *(a16 + 30); + b[511] = *(a16 + 31); +#endif + + a01 += 32; + a02 += 32; + a03 += 32; + a04 += 32; + a05 += 32; + a06 += 32; + a07 += 32; + a08 += 32; + a09 += 32; + a10 += 32; + a11 += 32; + a12 += 32; + a13 += 32; + a14 += 32; + a15 += 32; + a16 += 32; + b += 512; + } + + X += 16; + i --; + } while (i > 0); + } + + i = (m & 15); + if (i) { + + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + a09 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; + } + } else + if (X < posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + a09 += i * lda; + a10 += i * lda; + a11 += i * lda; + a12 += i * lda; + a13 += i * lda; + a14 += i * lda; + a15 += i * lda; + a16 += i * lda; */ + b += 32 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + + if (i >= 2) { + b[ 0] = *(a01 + 2); + b[ 1] = *(a01 + 3); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 3) { + b[ 0] = *(a01 + 4); + b[ 1] = *(a01 + 5); + b[ 2] = *(a02 + 4); + b[ 3] = *(a02 + 5); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 4) { + b[ 0] = *(a01 + 6); + b[ 1] = *(a01 + 7); + b[ 2] = *(a02 + 6); + b[ 3] = *(a02 + 7); + b[ 4] = *(a03 + 6); + b[ 5] = *(a03 + 7); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 5) { + b[ 0] = *(a01 + 8); + b[ 1] = *(a01 + 9); + b[ 2] = *(a02 + 8); + b[ 3] = *(a02 + 9); + b[ 4] = *(a03 + 8); + b[ 5] = *(a03 + 9); + b[ 6] = *(a04 + 8); + b[ 7] = *(a04 + 9); +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 6) { + b[ 0] = *(a01 + 10); + b[ 1] = *(a01 + 11); + b[ 2] = *(a02 + 10); + b[ 3] = *(a02 + 11); + b[ 4] = *(a03 + 10); + b[ 5] = *(a03 + 11); + b[ 6] = *(a04 + 10); + b[ 7] = *(a04 + 11); + b[ 8] = *(a05 + 10); + b[ 9] = *(a05 + 11); +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 7) { + b[ 0] = *(a01 + 12); + b[ 1] = *(a01 + 13); + b[ 2] = *(a02 + 12); + b[ 3] = *(a02 + 13); + b[ 4] = *(a03 + 12); + b[ 5] = *(a03 + 13); + b[ 6] = *(a04 + 12); + b[ 7] = *(a04 + 13); + b[ 8] = *(a05 + 12); + b[ 9] = *(a05 + 13); + b[10] = *(a06 + 12); + b[11] = *(a06 + 13); +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 8) { + b[ 0] = *(a01 + 14); + b[ 1] = *(a01 + 15); + b[ 2] = *(a02 + 14); + b[ 3] = *(a02 + 15); + b[ 4] = *(a03 + 14); + b[ 5] = *(a03 + 15); + b[ 6] = *(a04 + 14); + b[ 7] = *(a04 + 15); + b[ 8] = *(a05 + 14); + b[ 9] = *(a05 + 15); + b[ 10] = *(a06 + 14); + b[ 11] = *(a06 + 15); + b[ 12] = *(a07 + 14); + b[ 13] = *(a07 + 15); +#ifdef UNIT + b[ 14] = ONE; + b[ 15] = ZERO; +#else + b[ 14] = *(a08 + 14); + b[ 15] = *(a08 + 15); +#endif + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 9) { + b[ 0] = *(a01 + 16); + b[ 1] = *(a01 + 17); + b[ 2] = *(a02 + 16); + b[ 3] = *(a02 + 17); + b[ 4] = *(a03 + 16); + b[ 5] = *(a03 + 17); + b[ 6] = *(a04 + 16); + b[ 7] = *(a04 + 17); + b[ 8] = *(a05 + 16); + b[ 9] = *(a05 + 17); + b[ 10] = *(a06 + 16); + b[ 11] = *(a06 + 17); + b[ 12] = *(a07 + 16); + b[ 13] = *(a07 + 17); + b[ 14] = *(a08 + 16); + b[ 15] = *(a08 + 17); +#ifdef UNIT + b[ 16] = ONE; + b[ 17] = ZERO; +#else + b[ 16] = *(a09 + 16); + b[ 17] = *(a09 + 17); +#endif + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 10) { + b[ 0] = *(a01 + 18); + b[ 1] = *(a01 + 19); + b[ 2] = *(a02 + 18); + b[ 3] = *(a02 + 19); + b[ 4] = *(a03 + 18); + b[ 5] = *(a03 + 19); + b[ 6] = *(a04 + 18); + b[ 7] = *(a04 + 19); + b[ 8] = *(a05 + 18); + b[ 9] = *(a05 + 19); + b[ 10] = *(a06 + 18); + b[ 11] = *(a06 + 19); + b[ 12] = *(a07 + 18); + b[ 13] = *(a07 + 19); + b[ 14] = *(a08 + 18); + b[ 15] = *(a08 + 19); + b[ 16] = *(a09 + 18); + b[ 17] = *(a09 + 19); +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a10 + 18); + b[ 19] = *(a10 + 19); +#endif + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 11) { + b[ 0] = *(a01 + 20); + b[ 1] = *(a01 + 21); + b[ 2] = *(a02 + 20); + b[ 3] = *(a02 + 21); + b[ 4] = *(a03 + 20); + b[ 5] = *(a03 + 21); + b[ 6] = *(a04 + 20); + b[ 7] = *(a04 + 21); + b[ 8] = *(a05 + 20); + b[ 9] = *(a05 + 21); + b[ 10] = *(a06 + 20); + b[ 11] = *(a06 + 21); + b[ 12] = *(a07 + 20); + b[ 13] = *(a07 + 21); + b[ 14] = *(a08 + 20); + b[ 15] = *(a08 + 21); + b[ 16] = *(a09 + 20); + b[ 17] = *(a09 + 21); + b[ 18] = *(a10 + 20); + b[ 19] = *(a10 + 21); +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a11 + 20); + b[ 21] = *(a11 + 21); +#endif + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 12) { + b[ 0] = *(a01 + 22); + b[ 1] = *(a01 + 23); + b[ 2] = *(a02 + 22); + b[ 3] = *(a02 + 23); + b[ 4] = *(a03 + 22); + b[ 5] = *(a03 + 23); + b[ 6] = *(a04 + 22); + b[ 7] = *(a04 + 23); + b[ 8] = *(a05 + 22); + b[ 9] = *(a05 + 23); + b[ 10] = *(a06 + 22); + b[ 11] = *(a06 + 23); + b[ 12] = *(a07 + 22); + b[ 13] = *(a07 + 23); + b[ 14] = *(a08 + 22); + b[ 15] = *(a08 + 23); + b[ 16] = *(a09 + 22); + b[ 17] = *(a09 + 23); + b[ 18] = *(a10 + 22); + b[ 19] = *(a10 + 23); + b[ 20] = *(a11 + 22); + b[ 21] = *(a11 + 23); +#ifdef UNIT + b[ 22] = ONE; + b[ 23] = ZERO; +#else + b[ 22] = *(a12 + 22); + b[ 23] = *(a12 + 23); +#endif + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 13) { + b[ 0] = *(a01 + 24); + b[ 1] = *(a01 + 25); + b[ 2] = *(a02 + 24); + b[ 3] = *(a02 + 25); + b[ 4] = *(a03 + 24); + b[ 5] = *(a03 + 25); + b[ 6] = *(a04 + 24); + b[ 7] = *(a04 + 25); + b[ 8] = *(a05 + 24); + b[ 9] = *(a05 + 25); + b[ 10] = *(a06 + 24); + b[ 11] = *(a06 + 25); + b[ 12] = *(a07 + 24); + b[ 13] = *(a07 + 25); + b[ 14] = *(a08 + 24); + b[ 15] = *(a08 + 25); + b[ 16] = *(a09 + 24); + b[ 17] = *(a09 + 25); + b[ 18] = *(a10 + 24); + b[ 19] = *(a10 + 25); + b[ 20] = *(a11 + 24); + b[ 21] = *(a11 + 25); + b[ 22] = *(a12 + 24); + b[ 23] = *(a12 + 25); +#ifdef UNIT + b[ 24] = ONE; + b[ 25] = ZERO; +#else + b[ 24] = *(a13 + 24); + b[ 25] = *(a13 + 25); +#endif + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 14) { + b[ 0] = *(a01 + 26); + b[ 1] = *(a01 + 27); + b[ 2] = *(a02 + 26); + b[ 3] = *(a02 + 27); + b[ 4] = *(a03 + 26); + b[ 5] = *(a03 + 27); + b[ 6] = *(a04 + 26); + b[ 7] = *(a04 + 27); + b[ 8] = *(a05 + 26); + b[ 9] = *(a05 + 27); + b[ 10] = *(a06 + 26); + b[ 11] = *(a06 + 27); + b[ 12] = *(a07 + 26); + b[ 13] = *(a07 + 27); + b[ 14] = *(a08 + 26); + b[ 15] = *(a08 + 27); + b[ 16] = *(a09 + 26); + b[ 17] = *(a09 + 27); + b[ 18] = *(a10 + 26); + b[ 19] = *(a10 + 27); + b[ 20] = *(a11 + 26); + b[ 21] = *(a11 + 27); + b[ 22] = *(a12 + 26); + b[ 23] = *(a12 + 27); + b[ 24] = *(a13 + 26); + b[ 25] = *(a13 + 27); +#ifdef UNIT + b[ 26] = ONE; + b[ 27] = ZERO; +#else + b[ 26] = *(a14 + 26); + b[ 27] = *(a14 + 27); +#endif + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 15) { + b[ 0] = *(a01 + 28); + b[ 1] = *(a01 + 29); + b[ 2] = *(a02 + 28); + b[ 3] = *(a02 + 29); + b[ 4] = *(a03 + 28); + b[ 5] = *(a03 + 29); + b[ 6] = *(a04 + 28); + b[ 7] = *(a04 + 29); + b[ 8] = *(a05 + 28); + b[ 9] = *(a05 + 29); + b[ 10] = *(a06 + 28); + b[ 11] = *(a06 + 29); + b[ 12] = *(a07 + 28); + b[ 13] = *(a07 + 29); + b[ 14] = *(a08 + 28); + b[ 15] = *(a08 + 29); + b[ 16] = *(a09 + 28); + b[ 17] = *(a09 + 29); + b[ 18] = *(a10 + 28); + b[ 19] = *(a10 + 29); + b[ 20] = *(a11 + 28); + b[ 21] = *(a11 + 29); + b[ 22] = *(a12 + 28); + b[ 23] = *(a12 + 29); + b[ 24] = *(a13 + 28); + b[ 25] = *(a13 + 29); + b[ 26] = *(a14 + 28); + b[ 27] = *(a14 + 29); +#ifdef UNIT + b[ 28] = ONE; + b[ 29] = ZERO; +#else + b[ 28] = *(a15 + 28); + b[ 29] = *(a15 + 29); +#endif + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + b += 16; + } + } else + if (X < posY) { + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + + b += 128; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a01 + 2); + b[ 17] = *(a01 + 3); +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); +#endif + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a01 + 4); + b[ 33] = *(a01 + 5); + b[ 34] = *(a02 + 4); + b[ 35] = *(a02 + 5); +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); +#endif + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + + b[ 48] = *(a01 + 6); + b[ 49] = *(a01 + 7); + b[ 50] = *(a02 + 6); + b[ 51] = *(a02 + 7); + b[ 52] = *(a03 + 6); + b[ 53] = *(a03 + 7); +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); +#endif + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a01 + 8); + b[ 65] = *(a01 + 9); + b[ 66] = *(a02 + 8); + b[ 67] = *(a02 + 9); + b[ 68] = *(a03 + 8); + b[ 69] = *(a03 + 9); + b[ 70] = *(a04 + 8); + b[ 71] = *(a04 + 9); +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); +#endif + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + + b[ 80] = *(a01 + 10); + b[ 81] = *(a01 + 11); + b[ 82] = *(a02 + 10); + b[ 83] = *(a02 + 11); + b[ 84] = *(a03 + 10); + b[ 85] = *(a03 + 11); + b[ 86] = *(a04 + 10); + b[ 87] = *(a04 + 11); + b[ 88] = *(a05 + 10); + b[ 89] = *(a05 + 11); +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); +#endif + b[ 92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a01 + 12); + b[ 97] = *(a01 + 13); + b[ 98] = *(a02 + 12); + b[ 99] = *(a02 + 13); + b[100] = *(a03 + 12); + b[101] = *(a03 + 13); + b[102] = *(a04 + 12); + b[103] = *(a04 + 13); + b[104] = *(a05 + 12); + b[105] = *(a05 + 13); + b[106] = *(a06 + 12); + b[107] = *(a06 + 13); +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); +#endif + b[110] = ZERO; + b[111] = ZERO; + + b[112] = *(a01 + 14); + b[113] = *(a01 + 15); + b[114] = *(a02 + 14); + b[115] = *(a02 + 15); + b[116] = *(a03 + 14); + b[117] = *(a03 + 15); + b[118] = *(a04 + 14); + b[119] = *(a04 + 15); + b[120] = *(a05 + 14); + b[121] = *(a05 + 15); + b[122] = *(a06 + 14); + b[123] = *(a06 + 15); + b[124] = *(a07 + 14); + b[125] = *(a07 + 15); +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); +#endif + + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + b += 16; + } + } else + if (X < posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; */ + b += 16 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + + if (i >= 2) { + b[ 0] = *(a01 + 2); + b[ 1] = *(a01 + 3); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 3) { + b[ 0] = *(a01 + 4); + b[ 1] = *(a01 + 5); + b[ 2] = *(a02 + 4); + b[ 3] = *(a02 + 5); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 4) { + b[ 0] = *(a01 + 6); + b[ 1] = *(a01 + 7); + b[ 2] = *(a02 + 6); + b[ 3] = *(a02 + 7); + b[ 4] = *(a03 + 6); + b[ 5] = *(a03 + 7); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 5) { + b[ 0] = *(a01 + 8); + b[ 1] = *(a01 + 9); + b[ 2] = *(a02 + 8); + b[ 3] = *(a02 + 9); + b[ 4] = *(a03 + 8); + b[ 5] = *(a03 + 9); + b[ 6] = *(a04 + 8); + b[ 7] = *(a04 + 9); +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 6) { + b[ 0] = *(a01 + 10); + b[ 1] = *(a01 + 11); + b[ 2] = *(a02 + 10); + b[ 3] = *(a02 + 11); + b[ 4] = *(a03 + 10); + b[ 5] = *(a03 + 11); + b[ 6] = *(a04 + 10); + b[ 7] = *(a04 + 11); + b[ 8] = *(a05 + 10); + b[ 9] = *(a05 + 11); +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 7) { + b[ 0] = *(a01 + 12); + b[ 1] = *(a01 + 13); + b[ 2] = *(a02 + 12); + b[ 3] = *(a02 + 13); + b[ 4] = *(a03 + 12); + b[ 5] = *(a03 + 13); + b[ 6] = *(a04 + 12); + b[ 7] = *(a04 + 13); + b[ 8] = *(a05 + 12); + b[ 9] = *(a05 + 13); + b[10] = *(a06 + 12); + b[11] = *(a06 + 13); +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + } + } + + posY += 8; + } + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 4; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + b += 8; + } + } else + if (X < posY) { + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 32; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(a01 + 2); + b[ 9] = *(a01 + 3); +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(a02 + 2); + b[ 11] = *(a02 + 3); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a01 + 4); + b[ 17] = *(a01 + 5); + b[ 18] = *(a02 + 4); + b[ 19] = *(a02 + 5); +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); +#endif + b[ 22] = ZERO; + b[ 23] = ZERO; + + b[ 24] = *(a01 + 6); + b[ 25] = *(a01 + 7); + b[ 26] = *(a02 + 6); + b[ 27] = *(a02 + 7); + b[ 28] = *(a03 + 6); + b[ 29] = *(a03 + 7); +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); +#endif + + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + b += 8; + } + } else + if (X < posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; */ + b += 8 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if (i >= 2) { + b[ 0] = *(a01 + 2); + b[ 1] = *(a01 + 3); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = *(a01 + 4); + b[ 1] = *(a01 + 5); + b[ 2] = *(a02 + 4); + b[ 3] = *(a02 + 5); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a01 + 2); + b[ 5] = *(a01 + 3); + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); + + a01 += 4; + a02 += 4; + b += 8; + } else + if (X < posY) { + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = *(a01 + 2); + b[ 5] = *(a01 + 3); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); +#endif + a01 += 4; + a02 += 4; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + /* a01 += 2; + a02 += 2; */ + b += 4; + } else + if (X < posY) { + /* a01 += 2 * lda; + a02 += 2 * lda; */ + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + a01 += 2; + b += 2; + } else + if (X < posY) { + a01 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + a01 += 2; + b += 2; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrmm_ltcopy_16.c b/kernel/generic/ztrmm_ltcopy_16.c new file mode 100644 index 000000000..8d585e70b --- /dev/null +++ b/kernel/generic/ztrmm_ltcopy_16.c @@ -0,0 +1,2313 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + lda += lda; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + a09 = a + posY * 2 + (posX + 8) * lda; + a10 = a + posY * 2 + (posX + 9) * lda; + a11 = a + posY * 2 + (posX + 10) * lda; + a12 = a + posY * 2 + (posX + 11) * lda; + a13 = a + posY * 2 + (posX + 12) * lda; + a14 = a + posY * 2 + (posX + 13) * lda; + a15 = a + posY * 2 + (posX + 14) * lda; + a16 = a + posY * 2 + (posX + 15) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + a09 = a + posX * 2 + (posY + 8) * lda; + a10 = a + posX * 2 + (posY + 9) * lda; + a11 = a + posX * 2 + (posY + 10) * lda; + a12 = a + posX * 2 + (posY + 11) * lda; + a13 = a + posX * 2 + (posY + 12) * lda; + a14 = a + posX * 2 + (posY + 13) * lda; + a15 = a + posX * 2 + (posY + 14) * lda; + a16 = a + posX * 2 + (posY + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X > posY) { + a01 += 32; + a02 += 32; + a03 += 32; + a04 += 32; + a05 += 32; + a06 += 32; + a07 += 32; + a08 += 32; + a09 += 32; + a10 += 32; + a11 += 32; + a12 += 32; + a13 += 32; + a14 += 32; + a15 += 32; + a16 += 32; + b += 512; + } else + if (X < posY) { + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + + a01 += lda; + b += 32; + } + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + + b[ 32] = ZERO; + b[ 33] = ZERO; +#ifdef UNIT + b[ 34] = ONE; + b[ 35] = ZERO; +#else + b[ 34] = *(a02 + 2); + b[ 35] = *(a02 + 3); +#endif + b[ 36] = *(a02 + 4); + b[ 37] = *(a02 + 5); + b[ 38] = *(a02 + 6); + b[ 39] = *(a02 + 7); + b[ 40] = *(a02 + 8); + b[ 41] = *(a02 + 9); + b[ 42] = *(a02 + 10); + b[ 43] = *(a02 + 11); + b[ 44] = *(a02 + 12); + b[ 45] = *(a02 + 13); + b[ 46] = *(a02 + 14); + b[ 47] = *(a02 + 15); + b[ 48] = *(a02 + 16); + b[ 49] = *(a02 + 17); + b[ 50] = *(a02 + 18); + b[ 51] = *(a02 + 19); + b[ 52] = *(a02 + 20); + b[ 53] = *(a02 + 21); + b[ 54] = *(a02 + 22); + b[ 55] = *(a02 + 23); + b[ 56] = *(a02 + 24); + b[ 57] = *(a02 + 25); + b[ 58] = *(a02 + 26); + b[ 59] = *(a02 + 27); + b[ 60] = *(a02 + 28); + b[ 61] = *(a02 + 29); + b[ 62] = *(a02 + 30); + b[ 63] = *(a02 + 31); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; +#ifdef UNIT + b[ 68] = ONE; + b[ 69] = ZERO; +#else + b[ 68] = *(a03 + 4); + b[ 69] = *(a03 + 5); +#endif + b[ 70] = *(a03 + 6); + b[ 71] = *(a03 + 7); + b[ 72] = *(a03 + 8); + b[ 73] = *(a03 + 9); + b[ 74] = *(a03 + 10); + b[ 75] = *(a03 + 11); + b[ 76] = *(a03 + 12); + b[ 77] = *(a03 + 13); + b[ 78] = *(a03 + 14); + b[ 79] = *(a03 + 15); + b[ 80] = *(a03 + 16); + b[ 81] = *(a03 + 17); + b[ 82] = *(a03 + 18); + b[ 83] = *(a03 + 19); + b[ 84] = *(a03 + 20); + b[ 85] = *(a03 + 21); + b[ 86] = *(a03 + 22); + b[ 87] = *(a03 + 23); + b[ 88] = *(a03 + 24); + b[ 89] = *(a03 + 25); + b[ 90] = *(a03 + 26); + b[ 91] = *(a03 + 27); + b[ 92] = *(a03 + 28); + b[ 93] = *(a03 + 29); + b[ 94] = *(a03 + 30); + b[ 95] = *(a03 + 31); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; +#ifdef UNIT + b[102] = ONE; + b[103] = ZERO; +#else + b[102] = *(a04 + 6); + b[103] = *(a04 + 7); +#endif + b[104] = *(a04 + 8); + b[105] = *(a04 + 9); + b[106] = *(a04 + 10); + b[107] = *(a04 + 11); + b[108] = *(a04 + 12); + b[109] = *(a04 + 13); + b[110] = *(a04 + 14); + b[111] = *(a04 + 15); + b[112] = *(a04 + 16); + b[113] = *(a04 + 17); + b[114] = *(a04 + 18); + b[115] = *(a04 + 19); + b[116] = *(a04 + 20); + b[117] = *(a04 + 21); + b[118] = *(a04 + 22); + b[119] = *(a04 + 23); + b[120] = *(a04 + 24); + b[121] = *(a04 + 25); + b[122] = *(a04 + 26); + b[123] = *(a04 + 27); + b[124] = *(a04 + 28); + b[125] = *(a04 + 29); + b[126] = *(a04 + 30); + b[127] = *(a04 + 31); + + b[128] = ZERO; + b[129] = ZERO; + b[130] = ZERO; + b[131] = ZERO; + b[132] = ZERO; + b[133] = ZERO; + b[134] = ZERO; + b[135] = ZERO; +#ifdef UNIT + b[136] = ONE; + b[137] = ZERO; +#else + b[136] = *(a05 + 8); + b[137] = *(a05 + 9); +#endif + b[138] = *(a05 + 10); + b[139] = *(a05 + 11); + b[140] = *(a05 + 12); + b[141] = *(a05 + 13); + b[142] = *(a05 + 14); + b[143] = *(a05 + 15); + b[144] = *(a05 + 16); + b[145] = *(a05 + 17); + b[146] = *(a05 + 18); + b[147] = *(a05 + 19); + b[148] = *(a05 + 20); + b[149] = *(a05 + 21); + b[150] = *(a05 + 22); + b[151] = *(a05 + 23); + b[152] = *(a05 + 24); + b[153] = *(a05 + 25); + b[154] = *(a05 + 26); + b[155] = *(a05 + 27); + b[156] = *(a05 + 28); + b[157] = *(a05 + 29); + b[158] = *(a05 + 30); + b[159] = *(a05 + 31); + + b[160] = ZERO; + b[161] = ZERO; + b[162] = ZERO; + b[163] = ZERO; + b[164] = ZERO; + b[165] = ZERO; + b[166] = ZERO; + b[167] = ZERO; + b[168] = ZERO; + b[169] = ZERO; +#ifdef UNIT + b[170] = ONE; + b[171] = ZERO; +#else + b[170] = *(a06 + 10); + b[171] = *(a06 + 11); +#endif + b[172] = *(a06 + 12); + b[173] = *(a06 + 13); + b[174] = *(a06 + 14); + b[175] = *(a06 + 15); + b[176] = *(a06 + 16); + b[177] = *(a06 + 17); + b[178] = *(a06 + 18); + b[179] = *(a06 + 19); + b[180] = *(a06 + 20); + b[181] = *(a06 + 21); + b[182] = *(a06 + 22); + b[183] = *(a06 + 23); + b[184] = *(a06 + 24); + b[185] = *(a06 + 25); + b[186] = *(a06 + 26); + b[187] = *(a06 + 27); + b[188] = *(a06 + 28); + b[189] = *(a06 + 29); + b[190] = *(a06 + 30); + b[191] = *(a06 + 31); + + b[192] = ZERO; + b[193] = ZERO; + b[194] = ZERO; + b[195] = ZERO; + b[196] = ZERO; + b[197] = ZERO; + b[198] = ZERO; + b[199] = ZERO; + b[200] = ZERO; + b[201] = ZERO; + b[202] = ZERO; + b[203] = ZERO; +#ifdef UNIT + b[204] = ONE; + b[205] = ZERO; +#else + b[204] = *(a07 + 12); + b[205] = *(a07 + 13); +#endif + b[206] = *(a07 + 14); + b[207] = *(a07 + 15); + b[208] = *(a07 + 16); + b[209] = *(a07 + 17); + b[210] = *(a07 + 18); + b[211] = *(a07 + 19); + b[212] = *(a07 + 20); + b[213] = *(a07 + 21); + b[214] = *(a07 + 22); + b[215] = *(a07 + 23); + b[216] = *(a07 + 24); + b[217] = *(a07 + 25); + b[218] = *(a07 + 26); + b[219] = *(a07 + 27); + b[220] = *(a07 + 28); + b[221] = *(a07 + 29); + b[222] = *(a07 + 30); + b[223] = *(a07 + 31); + + b[224] = ZERO; + b[225] = ZERO; + b[226] = ZERO; + b[227] = ZERO; + b[228] = ZERO; + b[229] = ZERO; + b[230] = ZERO; + b[231] = ZERO; + b[232] = ZERO; + b[233] = ZERO; + b[234] = ZERO; + b[235] = ZERO; + b[236] = ZERO; + b[237] = ZERO; +#ifdef UNIT + b[238] = ONE; + b[239] = ZERO; +#else + b[238] = *(a08 + 14); + b[239] = *(a08 + 15); +#endif + b[240] = *(a08 + 16); + b[241] = *(a08 + 17); + b[242] = *(a08 + 18); + b[243] = *(a08 + 19); + b[244] = *(a08 + 20); + b[245] = *(a08 + 21); + b[246] = *(a08 + 22); + b[247] = *(a08 + 23); + b[248] = *(a08 + 24); + b[249] = *(a08 + 25); + b[250] = *(a08 + 26); + b[251] = *(a08 + 27); + b[252] = *(a08 + 28); + b[253] = *(a08 + 29); + b[254] = *(a08 + 30); + b[255] = *(a08 + 31); + + b[256] = ZERO; + b[257] = ZERO; + b[258] = ZERO; + b[259] = ZERO; + b[260] = ZERO; + b[261] = ZERO; + b[262] = ZERO; + b[263] = ZERO; + b[264] = ZERO; + b[265] = ZERO; + b[266] = ZERO; + b[267] = ZERO; + b[268] = ZERO; + b[269] = ZERO; + b[270] = ZERO; + b[271] = ZERO; +#ifdef UNIT + b[272] = ONE; + b[273] = ZERO; +#else + b[272] = *(a09 + 16); + b[273] = *(a09 + 17); +#endif + b[274] = *(a09 + 18); + b[275] = *(a09 + 19); + b[276] = *(a09 + 20); + b[277] = *(a09 + 21); + b[278] = *(a09 + 22); + b[279] = *(a09 + 23); + b[280] = *(a09 + 24); + b[281] = *(a09 + 25); + b[282] = *(a09 + 26); + b[283] = *(a09 + 27); + b[284] = *(a09 + 28); + b[285] = *(a09 + 29); + b[286] = *(a09 + 30); + b[287] = *(a09 + 31); + + b[288] = ZERO; + b[289] = ZERO; + b[290] = ZERO; + b[291] = ZERO; + b[292] = ZERO; + b[293] = ZERO; + b[294] = ZERO; + b[295] = ZERO; + b[296] = ZERO; + b[297] = ZERO; + b[298] = ZERO; + b[299] = ZERO; + b[300] = ZERO; + b[301] = ZERO; + b[302] = ZERO; + b[303] = ZERO; + b[304] = ZERO; + b[305] = ZERO; +#ifdef UNIT + b[306] = ONE; + b[307] = ZERO; +#else + b[306] = *(a10 + 18); + b[307] = *(a10 + 19); +#endif + b[308] = *(a10 + 20); + b[309] = *(a10 + 21); + b[310] = *(a10 + 22); + b[311] = *(a10 + 23); + b[312] = *(a10 + 24); + b[313] = *(a10 + 25); + b[314] = *(a10 + 26); + b[315] = *(a10 + 27); + b[316] = *(a10 + 28); + b[317] = *(a10 + 29); + b[318] = *(a10 + 30); + b[319] = *(a10 + 31); + + b[320] = ZERO; + b[321] = ZERO; + b[322] = ZERO; + b[323] = ZERO; + b[324] = ZERO; + b[325] = ZERO; + b[326] = ZERO; + b[327] = ZERO; + b[328] = ZERO; + b[329] = ZERO; + b[330] = ZERO; + b[331] = ZERO; + b[332] = ZERO; + b[333] = ZERO; + b[334] = ZERO; + b[335] = ZERO; + b[336] = ZERO; + b[337] = ZERO; + b[338] = ZERO; + b[339] = ZERO; +#ifdef UNIT + b[340] = ONE; + b[341] = ZERO; +#else + b[340] = *(a11 + 20); + b[341] = *(a11 + 21); +#endif + b[342] = *(a11 + 22); + b[343] = *(a11 + 23); + b[344] = *(a11 + 24); + b[345] = *(a11 + 25); + b[346] = *(a11 + 26); + b[347] = *(a11 + 27); + b[348] = *(a11 + 28); + b[349] = *(a11 + 29); + b[350] = *(a11 + 30); + b[351] = *(a11 + 31); + + b[352] = ZERO; + b[353] = ZERO; + b[354] = ZERO; + b[355] = ZERO; + b[356] = ZERO; + b[357] = ZERO; + b[358] = ZERO; + b[359] = ZERO; + b[360] = ZERO; + b[361] = ZERO; + b[362] = ZERO; + b[363] = ZERO; + b[364] = ZERO; + b[365] = ZERO; + b[366] = ZERO; + b[367] = ZERO; + b[368] = ZERO; + b[369] = ZERO; + b[370] = ZERO; + b[371] = ZERO; + b[372] = ZERO; + b[373] = ZERO; +#ifdef UNIT + b[374] = ONE; + b[375] = ZERO; +#else + b[374] = *(a12 + 22); + b[375] = *(a12 + 23); +#endif + b[376] = *(a12 + 24); + b[377] = *(a12 + 25); + b[378] = *(a12 + 26); + b[379] = *(a12 + 27); + b[380] = *(a12 + 28); + b[381] = *(a12 + 29); + b[382] = *(a12 + 30); + b[383] = *(a12 + 31); + + b[384] = ZERO; + b[385] = ZERO; + b[386] = ZERO; + b[387] = ZERO; + b[388] = ZERO; + b[389] = ZERO; + b[390] = ZERO; + b[391] = ZERO; + b[392] = ZERO; + b[393] = ZERO; + b[394] = ZERO; + b[395] = ZERO; + b[396] = ZERO; + b[397] = ZERO; + b[398] = ZERO; + b[399] = ZERO; + b[400] = ZERO; + b[401] = ZERO; + b[402] = ZERO; + b[403] = ZERO; + b[404] = ZERO; + b[405] = ZERO; + b[406] = ZERO; + b[407] = ZERO; +#ifdef UNIT + b[408] = ONE; + b[409] = ZERO; +#else + b[408] = *(a13 + 24); + b[409] = *(a13 + 25); +#endif + b[410] = *(a13 + 26); + b[411] = *(a13 + 27); + b[412] = *(a13 + 28); + b[413] = *(a13 + 29); + b[414] = *(a13 + 30); + b[415] = *(a13 + 31); + + b[416] = ZERO; + b[417] = ZERO; + b[418] = ZERO; + b[419] = ZERO; + b[420] = ZERO; + b[421] = ZERO; + b[422] = ZERO; + b[423] = ZERO; + b[424] = ZERO; + b[425] = ZERO; + b[426] = ZERO; + b[427] = ZERO; + b[428] = ZERO; + b[429] = ZERO; + b[430] = ZERO; + b[431] = ZERO; + b[432] = ZERO; + b[433] = ZERO; + b[434] = ZERO; + b[435] = ZERO; + b[436] = ZERO; + b[437] = ZERO; + b[438] = ZERO; + b[439] = ZERO; + b[440] = ZERO; + b[441] = ZERO; +#ifdef UNIT + b[442] = ONE; + b[443] = ZERO; +#else + b[442] = *(a14 + 26); + b[443] = *(a14 + 27); +#endif + b[444] = *(a14 + 28); + b[445] = *(a14 + 29); + b[446] = *(a14 + 30); + b[447] = *(a14 + 31); + + b[448] = ZERO; + b[449] = ZERO; + b[450] = ZERO; + b[451] = ZERO; + b[452] = ZERO; + b[453] = ZERO; + b[454] = ZERO; + b[455] = ZERO; + b[456] = ZERO; + b[457] = ZERO; + b[458] = ZERO; + b[459] = ZERO; + b[460] = ZERO; + b[461] = ZERO; + b[462] = ZERO; + b[463] = ZERO; + b[464] = ZERO; + b[465] = ZERO; + b[466] = ZERO; + b[467] = ZERO; + b[468] = ZERO; + b[469] = ZERO; + b[470] = ZERO; + b[471] = ZERO; + b[472] = ZERO; + b[473] = ZERO; + b[474] = ZERO; + b[475] = ZERO; +#ifdef UNIT + b[476] = ONE; + b[477] = ZERO; +#else + b[476] = *(a15 + 28); + b[477] = *(a15 + 29); +#endif + b[478] = *(a15 + 30); + b[479] = *(a15 + 31); + + b[480] = ZERO; + b[481] = ZERO; + b[482] = ZERO; + b[483] = ZERO; + b[484] = ZERO; + b[485] = ZERO; + b[486] = ZERO; + b[487] = ZERO; + b[488] = ZERO; + b[489] = ZERO; + b[490] = ZERO; + b[491] = ZERO; + b[492] = ZERO; + b[493] = ZERO; + b[494] = ZERO; + b[495] = ZERO; + b[496] = ZERO; + b[497] = ZERO; + b[498] = ZERO; + b[499] = ZERO; + b[500] = ZERO; + b[501] = ZERO; + b[502] = ZERO; + b[503] = ZERO; + b[504] = ZERO; + b[505] = ZERO; + b[506] = ZERO; + b[507] = ZERO; + b[508] = ZERO; + b[509] = ZERO; +#ifdef UNIT + b[510] = ONE; + b[511] = ZERO; +#else + b[510] = *(a16 + 30); + b[511] = *(a16 + 31); +#endif + + a01 += 32; + a02 += 32; + a03 += 32; + a04 += 32; + a05 += 32; + a06 += 32; + a07 += 32; + a08 += 32; + a09 += 32; + a10 += 32; + a11 += 32; + a12 += 32; + a13 += 32; + a14 += 32; + a15 += 32; + a16 += 32; + b += 512; + } + + X += 16; + i --; + } while (i > 0); + } + + i = (m & 15); + if (i) { + + if (X > posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + a09 += i * lda; + a10 += i * lda; + a11 += i * lda; + a12 += i * lda; + a13 += i * lda; + a14 += i * lda; + a15 += i * lda; + a16 += i * lda; */ + b += 32 * i; + } else + if (X < posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + a09 += lda; + a10 += lda; + a11 += lda; + a12 += lda; + a13 += lda; + a14 += lda; + a15 += lda; + a16 += lda; + b += 32; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + b += 32; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a02 + 4); + b[ 5] = *(a02 + 5); + b[ 6] = *(a02 + 6); + b[ 7] = *(a02 + 7); + b[ 8] = *(a02 + 8); + b[ 9] = *(a02 + 9); + b[ 10] = *(a02 + 10); + b[ 11] = *(a02 + 11); + b[ 12] = *(a02 + 12); + b[ 13] = *(a02 + 13); + b[ 14] = *(a02 + 14); + b[ 15] = *(a02 + 15); + b[ 16] = *(a02 + 16); + b[ 17] = *(a02 + 17); + b[ 18] = *(a02 + 18); + b[ 19] = *(a02 + 19); + b[ 20] = *(a02 + 20); + b[ 21] = *(a02 + 21); + b[ 22] = *(a02 + 22); + b[ 23] = *(a02 + 23); + b[ 24] = *(a02 + 24); + b[ 25] = *(a02 + 25); + b[ 26] = *(a02 + 26); + b[ 27] = *(a02 + 27); + b[ 28] = *(a02 + 28); + b[ 29] = *(a02 + 29); + b[ 30] = *(a02 + 30); + b[ 31] = *(a02 + 31); + b += 32; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a03 + 6); + b[ 7] = *(a03 + 7); + b[ 8] = *(a03 + 8); + b[ 9] = *(a03 + 9); + b[ 10] = *(a03 + 10); + b[ 11] = *(a03 + 11); + b[ 12] = *(a03 + 12); + b[ 13] = *(a03 + 13); + b[ 14] = *(a03 + 14); + b[ 15] = *(a03 + 15); + b[ 16] = *(a03 + 16); + b[ 17] = *(a03 + 17); + b[ 18] = *(a03 + 18); + b[ 19] = *(a03 + 19); + b[ 20] = *(a03 + 20); + b[ 21] = *(a03 + 21); + b[ 22] = *(a03 + 22); + b[ 23] = *(a03 + 23); + b[ 24] = *(a03 + 24); + b[ 25] = *(a03 + 25); + b[ 26] = *(a03 + 26); + b[ 27] = *(a03 + 27); + b[ 28] = *(a03 + 28); + b[ 29] = *(a03 + 29); + b[ 30] = *(a03 + 30); + b[ 31] = *(a03 + 31); + b += 32; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = *(a04 + 8); + b[ 9] = *(a04 + 9); + b[ 10] = *(a04 + 10); + b[ 11] = *(a04 + 11); + b[ 12] = *(a04 + 12); + b[ 13] = *(a04 + 13); + b[ 14] = *(a04 + 14); + b[ 15] = *(a04 + 15); + b[ 16] = *(a04 + 16); + b[ 17] = *(a04 + 17); + b[ 18] = *(a04 + 18); + b[ 19] = *(a04 + 19); + b[ 20] = *(a04 + 20); + b[ 21] = *(a04 + 21); + b[ 22] = *(a04 + 22); + b[ 23] = *(a04 + 23); + b[ 24] = *(a04 + 24); + b[ 25] = *(a04 + 25); + b[ 26] = *(a04 + 26); + b[ 27] = *(a04 + 27); + b[ 28] = *(a04 + 28); + b[ 29] = *(a04 + 29); + b[ 30] = *(a04 + 30); + b[ 31] = *(a04 + 31); + b += 32; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[ 10] = *(a05 + 10); + b[ 11] = *(a05 + 11); + b[ 12] = *(a05 + 12); + b[ 13] = *(a05 + 13); + b[ 14] = *(a05 + 14); + b[ 15] = *(a05 + 15); + b[ 16] = *(a05 + 16); + b[ 17] = *(a05 + 17); + b[ 18] = *(a05 + 18); + b[ 19] = *(a05 + 19); + b[ 20] = *(a05 + 20); + b[ 21] = *(a05 + 21); + b[ 22] = *(a05 + 22); + b[ 23] = *(a05 + 23); + b[ 24] = *(a05 + 24); + b[ 25] = *(a05 + 25); + b[ 26] = *(a05 + 26); + b[ 27] = *(a05 + 27); + b[ 28] = *(a05 + 28); + b[ 29] = *(a05 + 29); + b[ 30] = *(a05 + 30); + b[ 31] = *(a05 + 31); + b += 32; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[ 12] = *(a06 + 12); + b[ 13] = *(a06 + 13); + b[ 14] = *(a06 + 14); + b[ 15] = *(a06 + 15); + b[ 16] = *(a06 + 16); + b[ 17] = *(a06 + 17); + b[ 18] = *(a06 + 18); + b[ 19] = *(a06 + 19); + b[ 20] = *(a06 + 20); + b[ 21] = *(a06 + 21); + b[ 22] = *(a06 + 22); + b[ 23] = *(a06 + 23); + b[ 24] = *(a06 + 24); + b[ 25] = *(a06 + 25); + b[ 26] = *(a06 + 26); + b[ 27] = *(a06 + 27); + b[ 28] = *(a06 + 28); + b[ 29] = *(a06 + 29); + b[ 30] = *(a06 + 30); + b[ 31] = *(a06 + 31); + b += 32; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[ 14] = *(a07 + 14); + b[ 15] = *(a07 + 15); + b[ 16] = *(a07 + 16); + b[ 17] = *(a07 + 17); + b[ 18] = *(a07 + 18); + b[ 19] = *(a07 + 19); + b[ 20] = *(a07 + 20); + b[ 21] = *(a07 + 21); + b[ 22] = *(a07 + 22); + b[ 23] = *(a07 + 23); + b[ 24] = *(a07 + 24); + b[ 25] = *(a07 + 25); + b[ 26] = *(a07 + 26); + b[ 27] = *(a07 + 27); + b[ 28] = *(a07 + 28); + b[ 29] = *(a07 + 29); + b[ 30] = *(a07 + 30); + b[ 31] = *(a07 + 31); + b += 32; + } + + if (i >= 8) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; +#ifdef UNIT + b[ 14] = ONE; + b[ 15] = ZERO; +#else + b[ 14] = *(a08 + 14); + b[ 15] = *(a08 + 15); +#endif + b[ 16] = *(a08 + 16); + b[ 17] = *(a08 + 17); + b[ 18] = *(a08 + 18); + b[ 19] = *(a08 + 19); + b[ 20] = *(a08 + 20); + b[ 21] = *(a08 + 21); + b[ 22] = *(a08 + 22); + b[ 23] = *(a08 + 23); + b[ 24] = *(a08 + 24); + b[ 25] = *(a08 + 25); + b[ 26] = *(a08 + 26); + b[ 27] = *(a08 + 27); + b[ 28] = *(a08 + 28); + b[ 29] = *(a08 + 29); + b[ 30] = *(a08 + 30); + b[ 31] = *(a08 + 31); + b += 32; + } + + if (i >= 9) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; +#ifdef UNIT + b[ 16] = ONE; + b[ 17] = ZERO; +#else + b[ 16] = *(a09 + 16); + b[ 17] = *(a09 + 17); +#endif + b[ 18] = *(a09 + 18); + b[ 19] = *(a09 + 19); + b[ 20] = *(a09 + 20); + b[ 21] = *(a09 + 21); + b[ 22] = *(a09 + 22); + b[ 23] = *(a09 + 23); + b[ 24] = *(a09 + 24); + b[ 25] = *(a09 + 25); + b[ 26] = *(a09 + 26); + b[ 27] = *(a09 + 27); + b[ 28] = *(a09 + 28); + b[ 29] = *(a09 + 29); + b[ 30] = *(a09 + 30); + b[ 31] = *(a09 + 31); + b += 32; + } + + if (i >= 10) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a10 + 18); + b[ 19] = *(a10 + 19); +#endif + b[ 20] = *(a10 + 20); + b[ 21] = *(a10 + 21); + b[ 22] = *(a10 + 22); + b[ 23] = *(a10 + 23); + b[ 24] = *(a10 + 24); + b[ 25] = *(a10 + 25); + b[ 26] = *(a10 + 26); + b[ 27] = *(a10 + 27); + b[ 28] = *(a10 + 28); + b[ 29] = *(a10 + 29); + b[ 30] = *(a10 + 30); + b[ 31] = *(a10 + 31); + b += 32; + } + + if (i >= 11) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a11 + 20); + b[ 21] = *(a11 + 21); +#endif + b[ 22] = *(a11 + 22); + b[ 23] = *(a11 + 23); + b[ 24] = *(a11 + 24); + b[ 25] = *(a11 + 25); + b[ 26] = *(a11 + 26); + b[ 27] = *(a11 + 27); + b[ 28] = *(a11 + 28); + b[ 29] = *(a11 + 29); + b[ 30] = *(a11 + 30); + b[ 31] = *(a11 + 31); + b += 32; + } + + if (i >= 12) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; +#ifdef UNIT + b[ 22] = ONE; + b[ 23] = ZERO; +#else + b[ 22] = *(a12 + 22); + b[ 23] = *(a12 + 23); +#endif + b[ 24] = *(a12 + 24); + b[ 25] = *(a12 + 25); + b[ 26] = *(a12 + 26); + b[ 27] = *(a12 + 27); + b[ 28] = *(a12 + 28); + b[ 29] = *(a12 + 29); + b[ 30] = *(a12 + 30); + b[ 31] = *(a12 + 31); + b += 32; + } + + if (i >= 13) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; +#ifdef UNIT + b[ 24] = ONE; + b[ 25] = ZERO; +#else + b[ 24] = *(a13 + 24); + b[ 25] = *(a13 + 25); +#endif + b[ 26] = *(a13 + 26); + b[ 27] = *(a13 + 27); + b[ 28] = *(a13 + 28); + b[ 29] = *(a12 + 29); + b[ 30] = *(a13 + 30); + b[ 31] = *(a13 + 31); + b += 32; + } + + if (i >= 14) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; +#ifdef UNIT + b[ 26] = ONE; + b[ 27] = ZERO; +#else + b[ 26] = *(a14 + 26); + b[ 27] = *(a14 + 27); +#endif + b[ 28] = *(a14 + 28); + b[ 29] = *(a14 + 29); + b[ 30] = *(a14 + 30); + b[ 31] = *(a14 + 31); + b += 32; + } + + if (i >= 15) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; +#ifdef UNIT + b[ 28] = ONE; + b[ 29] = ZERO; +#else + b[ 28] = *(a15 + 28); + b[ 29] = *(a15 + 29); +#endif + b[ 30] = *(a15 + 30); + b[ 31] = *(a15 + 31); + b += 32; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X > posY) { + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } else + if (X < posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + b += 16; + } + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); +#endif + b[ 20] = *(a02 + 4); + b[ 21] = *(a02 + 5); + b[ 22] = *(a02 + 6); + b[ 23] = *(a02 + 7); + b[ 24] = *(a02 + 8); + b[ 25] = *(a02 + 9); + b[ 26] = *(a02 + 10); + b[ 27] = *(a02 + 11); + b[ 28] = *(a02 + 12); + b[ 29] = *(a02 + 13); + b[ 30] = *(a02 + 14); + b[ 31] = *(a02 + 15); + + b[ 32] = ZERO; + b[ 33] = ZERO; + b[ 34] = ZERO; + b[ 35] = ZERO; +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); +#endif + b[ 38] = *(a03 + 6); + b[ 39] = *(a03 + 7); + b[ 40] = *(a03 + 8); + b[ 41] = *(a03 + 9); + b[ 42] = *(a03 + 10); + b[ 43] = *(a03 + 11); + b[ 44] = *(a03 + 12); + b[ 45] = *(a03 + 13); + b[ 46] = *(a03 + 14); + b[ 47] = *(a03 + 15); + + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); +#endif + b[ 56] = *(a04 + 8); + b[ 57] = *(a04 + 9); + b[ 58] = *(a04 + 10); + b[ 59] = *(a04 + 11); + b[ 60] = *(a04 + 12); + b[ 61] = *(a04 + 13); + b[ 62] = *(a04 + 14); + b[ 63] = *(a04 + 15); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; + b[ 68] = ZERO; + b[ 69] = ZERO; + b[ 70] = ZERO; + b[ 71] = ZERO; +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); +#endif + b[ 74] = *(a05 + 10); + b[ 75] = *(a05 + 11); + b[ 76] = *(a05 + 12); + b[ 77] = *(a05 + 13); + b[ 78] = *(a05 + 14); + b[ 79] = *(a05 + 15); + + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; + b[ 85] = ZERO; + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); +#endif + b[ 92] = *(a06 + 12); + b[ 93] = *(a06 + 13); + b[ 94] = *(a06 + 14); + b[ 95] = *(a06 + 15); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; + b[102] = ZERO; + b[103] = ZERO; + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); +#endif + b[110] = *(a07 + 14); + b[111] = *(a07 + 15); + + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; + b[119] = ZERO; + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); +#endif + + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X > posY) { + /* a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; + a05 += 2 * i; + a06 += 2 * i; + a07 += 2 * i; + a08 += 2 * i; */ + b += 16 * i; + } else + if (X < posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + b += 16; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + b += 16; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a02 + 4); + b[ 5] = *(a02 + 5); + b[ 6] = *(a02 + 6); + b[ 7] = *(a02 + 7); + + b[ 8] = *(a02 + 8); + b[ 9] = *(a02 + 9); + b[10] = *(a02 + 10); + b[11] = *(a02 + 11); + b[12] = *(a02 + 12); + b[13] = *(a02 + 13); + b[14] = *(a02 + 14); + b[15] = *(a02 + 15); + b += 16; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a03 + 6); + b[ 7] = *(a03 + 7); + + b[ 8] = *(a03 + 8); + b[ 9] = *(a03 + 9); + b[10] = *(a03 + 10); + b[11] = *(a03 + 11); + b[12] = *(a03 + 12); + b[13] = *(a03 + 13); + b[14] = *(a03 + 14); + b[15] = *(a03 + 15); + b += 16; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + + b[ 8] = *(a04 + 8); + b[ 9] = *(a04 + 9); + b[10] = *(a04 + 10); + b[11] = *(a04 + 11); + b[12] = *(a04 + 12); + b[13] = *(a04 + 13); + b[14] = *(a04 + 14); + b[15] = *(a04 + 15); + b += 16; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[10] = *(a05 + 10); + b[11] = *(a05 + 11); + b[12] = *(a05 + 12); + b[13] = *(a05 + 13); + b[14] = *(a05 + 14); + b[15] = *(a05 + 15); + b += 16; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[12] = *(a06 + 12); + b[13] = *(a06 + 13); + b[14] = *(a06 + 14); + b[15] = *(a06 + 15); + b += 16; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[14] = *(a07 + 14); + b[15] = *(a07 + 15); + b += 16; + } + } + } + + posY += 8; + } + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + b += 32; + } else + if (X < posY) { + for (ii = 0; ii < 4; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + b += 8; + } + + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(a02 + 2); + b[ 11] = *(a02 + 3); +#endif + b[ 12] = *(a02 + 4); + b[ 13] = *(a02 + 5); + b[ 14] = *(a02 + 6); + b[ 15] = *(a02 + 7); + + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); +#endif + b[ 22] = *(a03 + 6); + b[ 23] = *(a03 + 7); + + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); +#endif + + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i > 0) { + if (X > posY) { + /* a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; */ + b += 8 * i; + } else + if (X < posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + b += 8; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b += 8; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a02 + 4); + b[ 5] = *(a02 + 5); + b[ 6] = *(a02 + 6); + b[ 7] = *(a02 + 7); + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a03 + 6); + b[ 7] = *(a03 + 7); + b += 8; + } + } + } + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + a01 += 4; + a02 += 4; + b += 8; + } else + if (X < posY) { + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); + b[2] = *(a01 + 2); + b[3] = *(a01 + 3); + b[4] = *(a02 + 0); + b[5] = *(a02 + 1); + b[6] = *(a02 + 2); + b[7] = *(a02 + 3); + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[0] = ONE; + b[1] = ZERO; +#else + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); +#endif + b[2] = *(a01 + 2); + b[3] = *(a01 + 3); + + b[4] = ZERO; + b[5] = ZERO; +#ifdef UNIT + b[6] = ONE; + b[7] = ZERO; +#else + b[6] = *(a02 + 2); + b[7] = *(a02 + 3); +#endif + a01 += 4; + a02 += 4; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i > 0) { + if (X > posY) { + /* a01 += 2; + a02 += 2; */ + b += 4; + } else + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + + /* a01 += lda; + a02 += lda; */ + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b += 4; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X > posY) { + a01 += 2; + b += 2; + } else + if (X < posY) { + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); + a01 += lda; + b += 2; + } else { +#ifdef UNIT + b[0] = ONE; + b[1] = ZERO; +#else + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); +#endif + a01 += 2; + b += 2; + } + + X += 1; + i --; + } while (i > 0); + } + // posY += 1; + } + + return 0; +} diff --git a/kernel/generic/ztrmm_uncopy_16.c b/kernel/generic/ztrmm_uncopy_16.c new file mode 100644 index 000000000..40b85db38 --- /dev/null +++ b/kernel/generic/ztrmm_uncopy_16.c @@ -0,0 +1,2316 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + lda += lda; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + a09 = a + posX * 2 + (posY + 8) * lda; + a10 = a + posX * 2 + (posY + 9) * lda; + a11 = a + posX * 2 + (posY + 10) * lda; + a12 = a + posX * 2 + (posY + 11) * lda; + a13 = a + posX * 2 + (posY + 12) * lda; + a14 = a + posX * 2 + (posY + 13) * lda; + a15 = a + posX * 2 + (posY + 14) * lda; + a16 = a + posX * 2 + (posY + 15) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + a09 = a + posY * 2 + (posX + 8) * lda; + a10 = a + posY * 2 + (posX + 9) * lda; + a11 = a + posY * 2 + (posX + 10) * lda; + a12 = a + posY * 2 + (posX + 11) * lda; + a13 = a + posY * 2 + (posX + 12) * lda; + a14 = a + posY * 2 + (posX + 13) * lda; + a15 = a + posY * 2 + (posX + 14) * lda; + a16 = a + posY * 2 + (posX + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + a09 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; + } + } else + if (X > posY) { + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + b += 512; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + + b[ 32] = ZERO; + b[ 33] = ZERO; +#ifdef UNIT + b[ 34] = ONE; + b[ 35] = ZERO; +#else + b[ 34] = *(a02 + 2); + b[ 35] = *(a02 + 3); +#endif + b[ 36] = *(a03 + 2); + b[ 37] = *(a03 + 3); + b[ 38] = *(a04 + 2); + b[ 39] = *(a04 + 3); + b[ 40] = *(a05 + 2); + b[ 41] = *(a05 + 3); + b[ 42] = *(a06 + 2); + b[ 43] = *(a06 + 3); + b[ 44] = *(a07 + 2); + b[ 45] = *(a07 + 3); + b[ 46] = *(a08 + 2); + b[ 47] = *(a08 + 3); + b[ 48] = *(a09 + 2); + b[ 49] = *(a09 + 3); + b[ 50] = *(a10 + 2); + b[ 51] = *(a10 + 3); + b[ 52] = *(a11 + 2); + b[ 53] = *(a11 + 3); + b[ 54] = *(a12 + 2); + b[ 55] = *(a12 + 3); + b[ 56] = *(a13 + 2); + b[ 57] = *(a13 + 3); + b[ 58] = *(a14 + 2); + b[ 59] = *(a14 + 3); + b[ 60] = *(a15 + 2); + b[ 61] = *(a15 + 3); + b[ 62] = *(a16 + 2); + b[ 63] = *(a16 + 3); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; +#ifdef UNIT + b[ 68] = ONE; + b[ 69] = ZERO; +#else + b[ 68] = *(a03 + 4); + b[ 69] = *(a03 + 5); +#endif + b[ 70] = *(a04 + 4); + b[ 71] = *(a04 + 5); + b[ 72] = *(a05 + 4); + b[ 73] = *(a05 + 5); + b[ 74] = *(a06 + 4); + b[ 75] = *(a06 + 5); + b[ 76] = *(a07 + 4); + b[ 77] = *(a07 + 5); + b[ 78] = *(a08 + 4); + b[ 79] = *(a08 + 5); + b[ 80] = *(a09 + 4); + b[ 81] = *(a09 + 5); + b[ 82] = *(a10 + 4); + b[ 83] = *(a10 + 5); + b[ 84] = *(a11 + 4); + b[ 85] = *(a11 + 5); + b[ 86] = *(a12 + 4); + b[ 87] = *(a12 + 5); + b[ 88] = *(a13 + 4); + b[ 89] = *(a13 + 5); + b[ 90] = *(a14 + 4); + b[ 91] = *(a14 + 5); + b[ 92] = *(a15 + 4); + b[ 93] = *(a15 + 5); + b[ 94] = *(a16 + 4); + b[ 95] = *(a16 + 5); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; +#ifdef UNIT + b[102] = ONE; + b[103] = ZERO; +#else + b[102] = *(a04 + 6); + b[103] = *(a04 + 7); +#endif + b[104] = *(a05 + 6); + b[105] = *(a05 + 7); + b[106] = *(a06 + 6); + b[107] = *(a06 + 7); + b[108] = *(a07 + 6); + b[109] = *(a07 + 7); + b[110] = *(a08 + 6); + b[111] = *(a08 + 7); + b[112] = *(a09 + 6); + b[113] = *(a09 + 7); + b[114] = *(a10 + 6); + b[115] = *(a10 + 7); + b[116] = *(a11 + 6); + b[117] = *(a11 + 7); + b[118] = *(a12 + 6); + b[119] = *(a12 + 7); + b[120] = *(a13 + 6); + b[121] = *(a13 + 7); + b[122] = *(a14 + 6); + b[123] = *(a14 + 7); + b[124] = *(a15 + 6); + b[125] = *(a15 + 7); + b[126] = *(a16 + 6); + b[127] = *(a16 + 7); + + b[128] = ZERO; + b[129] = ZERO; + b[130] = ZERO; + b[131] = ZERO; + b[132] = ZERO; + b[133] = ZERO; + b[134] = ZERO; + b[135] = ZERO; +#ifdef UNIT + b[136] = ONE; + b[137] = ZERO; +#else + b[136] = *(a05 + 8); + b[137] = *(a05 + 9); +#endif + b[138] = *(a06 + 8); + b[139] = *(a06 + 9); + b[140] = *(a07 + 8); + b[141] = *(a07 + 9); + b[142] = *(a08 + 8); + b[143] = *(a08 + 9); + b[144] = *(a09 + 8); + b[145] = *(a09 + 9); + b[146] = *(a10 + 8); + b[147] = *(a10 + 9); + b[148] = *(a11 + 8); + b[149] = *(a11 + 9); + b[150] = *(a12 + 8); + b[151] = *(a12 + 9); + b[152] = *(a13 + 8); + b[153] = *(a13 + 9); + b[154] = *(a14 + 8); + b[155] = *(a14 + 9); + b[156] = *(a15 + 8); + b[157] = *(a15 + 9); + b[158] = *(a16 + 8); + b[159] = *(a16 + 9); + + b[160] = ZERO; + b[161] = ZERO; + b[162] = ZERO; + b[163] = ZERO; + b[164] = ZERO; + b[165] = ZERO; + b[166] = ZERO; + b[167] = ZERO; + b[168] = ZERO; + b[169] = ZERO; +#ifdef UNIT + b[170] = ONE; + b[171] = ZERO; +#else + b[170] = *(a06 + 10); + b[171] = *(a06 + 11); +#endif + b[172] = *(a07 + 10); + b[173] = *(a07 + 11); + b[174] = *(a08 + 10); + b[175] = *(a08 + 11); + b[176] = *(a09 + 10); + b[177] = *(a09 + 11); + b[178] = *(a10 + 10); + b[179] = *(a10 + 11); + b[180] = *(a11 + 10); + b[181] = *(a11 + 11); + b[182] = *(a12 + 10); + b[183] = *(a12 + 11); + b[184] = *(a13 + 10); + b[185] = *(a13 + 11); + b[186] = *(a14 + 10); + b[187] = *(a14 + 11); + b[188] = *(a15 + 10); + b[189] = *(a15 + 11); + b[190] = *(a16 + 10); + b[191] = *(a16 + 11); + + b[192] = ZERO; + b[193] = ZERO; + b[194] = ZERO; + b[195] = ZERO; + b[196] = ZERO; + b[197] = ZERO; + b[198] = ZERO; + b[199] = ZERO; + b[200] = ZERO; + b[201] = ZERO; + b[202] = ZERO; + b[203] = ZERO; +#ifdef UNIT + b[204] = ONE; + b[205] = ZERO; +#else + b[204] = *(a07 + 12); + b[205] = *(a07 + 13); +#endif + b[206] = *(a08 + 12); + b[207] = *(a08 + 13); + b[208] = *(a09 + 12); + b[209] = *(a09 + 13); + b[210] = *(a10 + 12); + b[211] = *(a10 + 13); + b[212] = *(a11 + 12); + b[213] = *(a11 + 13); + b[214] = *(a12 + 12); + b[215] = *(a12 + 13); + b[216] = *(a13 + 12); + b[217] = *(a13 + 13); + b[218] = *(a14 + 12); + b[219] = *(a14 + 13); + b[220] = *(a15 + 12); + b[221] = *(a15 + 13); + b[222] = *(a16 + 12); + b[223] = *(a16 + 13); + + b[224] = ZERO; + b[225] = ZERO; + b[226] = ZERO; + b[227] = ZERO; + b[228] = ZERO; + b[229] = ZERO; + b[230] = ZERO; + b[231] = ZERO; + b[232] = ZERO; + b[233] = ZERO; + b[234] = ZERO; + b[235] = ZERO; + b[236] = ZERO; + b[237] = ZERO; +#ifdef UNIT + b[238] = ONE; + b[239] = ZERO; +#else + b[238] = *(a08 + 14); + b[239] = *(a08 + 15); +#endif + b[240] = *(a09 + 14); + b[241] = *(a09 + 15); + b[242] = *(a10 + 14); + b[243] = *(a10 + 15); + b[244] = *(a11 + 14); + b[245] = *(a11 + 15); + b[246] = *(a12 + 14); + b[247] = *(a12 + 15); + b[248] = *(a13 + 14); + b[249] = *(a13 + 15); + b[250] = *(a14 + 14); + b[251] = *(a14 + 15); + b[252] = *(a15 + 14); + b[253] = *(a15 + 15); + b[254] = *(a16 + 14); + b[255] = *(a16 + 15); + + b[256] = ZERO; + b[257] = ZERO; + b[258] = ZERO; + b[259] = ZERO; + b[260] = ZERO; + b[261] = ZERO; + b[262] = ZERO; + b[263] = ZERO; + b[264] = ZERO; + b[265] = ZERO; + b[266] = ZERO; + b[267] = ZERO; + b[268] = ZERO; + b[269] = ZERO; + b[270] = ZERO; + b[271] = ZERO; +#ifdef UNIT + b[272] = ONE; + b[273] = ZERO; +#else + b[272] = *(a09 + 16); + b[273] = *(a09 + 17); +#endif + b[274] = *(a10 + 16); + b[275] = *(a10 + 17); + b[276] = *(a11 + 16); + b[277] = *(a11 + 17); + b[278] = *(a12 + 16); + b[279] = *(a12 + 17); + b[280] = *(a13 + 16); + b[281] = *(a13 + 17); + b[282] = *(a14 + 16); + b[283] = *(a14 + 17); + b[284] = *(a15 + 16); + b[285] = *(a15 + 17); + b[286] = *(a16 + 16); + b[287] = *(a16 + 17); + + b[288] = ZERO; + b[289] = ZERO; + b[290] = ZERO; + b[291] = ZERO; + b[292] = ZERO; + b[293] = ZERO; + b[294] = ZERO; + b[295] = ZERO; + b[296] = ZERO; + b[297] = ZERO; + b[298] = ZERO; + b[299] = ZERO; + b[300] = ZERO; + b[301] = ZERO; + b[302] = ZERO; + b[303] = ZERO; + b[304] = ZERO; + b[305] = ZERO; +#ifdef UNIT + b[306] = ONE; + b[307] = ZERO; +#else + b[306] = *(a10 + 18); + b[307] = *(a10 + 19); +#endif + b[308] = *(a11 + 18); + b[309] = *(a11 + 19); + b[310] = *(a12 + 18); + b[311] = *(a12 + 19); + b[312] = *(a13 + 18); + b[313] = *(a13 + 19); + b[314] = *(a14 + 18); + b[315] = *(a14 + 19); + b[316] = *(a15 + 18); + b[317] = *(a15 + 19); + b[318] = *(a16 + 18); + b[319] = *(a16 + 19); + + b[320] = ZERO; + b[321] = ZERO; + b[322] = ZERO; + b[323] = ZERO; + b[324] = ZERO; + b[325] = ZERO; + b[326] = ZERO; + b[327] = ZERO; + b[328] = ZERO; + b[329] = ZERO; + b[330] = ZERO; + b[331] = ZERO; + b[332] = ZERO; + b[333] = ZERO; + b[334] = ZERO; + b[335] = ZERO; + b[336] = ZERO; + b[337] = ZERO; + b[338] = ZERO; + b[339] = ZERO; +#ifdef UNIT + b[340] = ONE; + b[341] = ZERO; +#else + b[340] = *(a11 + 20); + b[341] = *(a11 + 21); +#endif + b[342] = *(a12 + 20); + b[343] = *(a12 + 21); + b[344] = *(a13 + 20); + b[345] = *(a13 + 21); + b[346] = *(a14 + 20); + b[347] = *(a14 + 21); + b[348] = *(a15 + 20); + b[349] = *(a15 + 21); + b[350] = *(a16 + 20); + b[351] = *(a16 + 21); + + b[352] = ZERO; + b[353] = ZERO; + b[354] = ZERO; + b[355] = ZERO; + b[356] = ZERO; + b[357] = ZERO; + b[358] = ZERO; + b[359] = ZERO; + b[360] = ZERO; + b[361] = ZERO; + b[362] = ZERO; + b[363] = ZERO; + b[364] = ZERO; + b[365] = ZERO; + b[366] = ZERO; + b[367] = ZERO; + b[368] = ZERO; + b[369] = ZERO; + b[370] = ZERO; + b[371] = ZERO; + b[372] = ZERO; + b[373] = ZERO; +#ifdef UNIT + b[374] = ONE; + b[375] = ZERO; +#else + b[374] = *(a12 + 22); + b[375] = *(a12 + 23); +#endif + b[376] = *(a13 + 22); + b[377] = *(a13 + 23); + b[378] = *(a14 + 22); + b[379] = *(a14 + 23); + b[380] = *(a15 + 22); + b[381] = *(a15 + 23); + b[382] = *(a16 + 22); + b[383] = *(a16 + 23); + + b[384] = ZERO; + b[385] = ZERO; + b[386] = ZERO; + b[387] = ZERO; + b[388] = ZERO; + b[389] = ZERO; + b[390] = ZERO; + b[391] = ZERO; + b[392] = ZERO; + b[393] = ZERO; + b[394] = ZERO; + b[395] = ZERO; + b[396] = ZERO; + b[397] = ZERO; + b[398] = ZERO; + b[399] = ZERO; + b[400] = ZERO; + b[401] = ZERO; + b[402] = ZERO; + b[403] = ZERO; + b[404] = ZERO; + b[405] = ZERO; + b[406] = ZERO; + b[407] = ZERO; +#ifdef UNIT + b[408] = ONE; + b[409] = ZERO; +#else + b[408] = *(a13 + 24); + b[409] = *(a13 + 25); +#endif + b[410] = *(a14 + 24); + b[411] = *(a14 + 25); + b[412] = *(a15 + 24); + b[413] = *(a15 + 25); + b[414] = *(a16 + 24); + b[415] = *(a16 + 25); + + b[416] = ZERO; + b[417] = ZERO; + b[418] = ZERO; + b[419] = ZERO; + b[420] = ZERO; + b[421] = ZERO; + b[422] = ZERO; + b[423] = ZERO; + b[424] = ZERO; + b[425] = ZERO; + b[426] = ZERO; + b[427] = ZERO; + b[428] = ZERO; + b[429] = ZERO; + b[430] = ZERO; + b[431] = ZERO; + b[432] = ZERO; + b[433] = ZERO; + b[434] = ZERO; + b[435] = ZERO; + b[436] = ZERO; + b[437] = ZERO; + b[438] = ZERO; + b[439] = ZERO; + b[440] = ZERO; + b[441] = ZERO; +#ifdef UNIT + b[442] = ONE; + b[443] = ZERO; +#else + b[442] = *(a14 + 26); + b[443] = *(a14 + 27); +#endif + b[444] = *(a15 + 26); + b[445] = *(a15 + 27); + b[446] = *(a16 + 26); + b[447] = *(a16 + 27); + + b[448] = ZERO; + b[449] = ZERO; + b[450] = ZERO; + b[451] = ZERO; + b[452] = ZERO; + b[453] = ZERO; + b[454] = ZERO; + b[455] = ZERO; + b[456] = ZERO; + b[457] = ZERO; + b[458] = ZERO; + b[459] = ZERO; + b[460] = ZERO; + b[461] = ZERO; + b[462] = ZERO; + b[463] = ZERO; + b[464] = ZERO; + b[465] = ZERO; + b[466] = ZERO; + b[467] = ZERO; + b[468] = ZERO; + b[469] = ZERO; + b[470] = ZERO; + b[471] = ZERO; + b[472] = ZERO; + b[473] = ZERO; + b[474] = ZERO; + b[475] = ZERO; +#ifdef UNIT + b[476] = ONE; + b[477] = ZERO; +#else + b[476] = *(a15 + 28); + b[477] = *(a15 + 29); +#endif + b[478] = *(a16 + 28); + b[479] = *(a16 + 29); + + b[480] = ZERO; + b[481] = ZERO; + b[482] = ZERO; + b[483] = ZERO; + b[484] = ZERO; + b[485] = ZERO; + b[486] = ZERO; + b[487] = ZERO; + b[488] = ZERO; + b[489] = ZERO; + b[490] = ZERO; + b[491] = ZERO; + b[492] = ZERO; + b[493] = ZERO; + b[494] = ZERO; + b[495] = ZERO; + b[496] = ZERO; + b[497] = ZERO; + b[498] = ZERO; + b[499] = ZERO; + b[500] = ZERO; + b[501] = ZERO; + b[502] = ZERO; + b[503] = ZERO; + b[504] = ZERO; + b[505] = ZERO; + b[506] = ZERO; + b[507] = ZERO; + b[508] = ZERO; + b[509] = ZERO; +#ifdef UNIT + b[510] = ONE; + b[511] = ZERO; +#else + b[510] = *(a16 + 30); + b[511] = *(a16 + 31); +#endif + + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + b += 512; + } + + X += 16; + i --; + } while (i > 0); + } + + i = (m & 15); + if (i) { + + if (X < posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + a09 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; + } + } else + if (X > posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + a09 += i * lda; + a10 += i * lda; + a11 += i * lda; + a12 += i * lda; + a13 += i * lda; + a14 += i * lda; + a15 += i * lda; + a16 += i * lda; */ + b += 32 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + b += 32; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a03 + 2); + b[ 5] = *(a03 + 3); + b[ 6] = *(a04 + 2); + b[ 7] = *(a04 + 3); + b[ 8] = *(a05 + 2); + b[ 9] = *(a05 + 3); + b[ 10] = *(a06 + 2); + b[ 11] = *(a06 + 3); + b[ 12] = *(a07 + 2); + b[ 13] = *(a07 + 3); + b[ 14] = *(a08 + 2); + b[ 15] = *(a08 + 3); + b[ 16] = *(a09 + 2); + b[ 17] = *(a09 + 3); + b[ 18] = *(a10 + 2); + b[ 19] = *(a10 + 3); + b[ 20] = *(a11 + 2); + b[ 21] = *(a11 + 3); + b[ 22] = *(a12 + 2); + b[ 23] = *(a12 + 3); + b[ 24] = *(a13 + 2); + b[ 25] = *(a13 + 3); + b[ 26] = *(a14 + 2); + b[ 27] = *(a14 + 3); + b[ 28] = *(a15 + 2); + b[ 29] = *(a15 + 3); + b[ 30] = *(a16 + 2); + b[ 31] = *(a16 + 3); + b += 32; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a04 + 4); + b[ 7] = *(a04 + 5); + b[ 8] = *(a05 + 4); + b[ 9] = *(a05 + 5); + b[ 10] = *(a06 + 4); + b[ 11] = *(a06 + 5); + b[ 12] = *(a07 + 4); + b[ 13] = *(a07 + 5); + b[ 14] = *(a08 + 4); + b[ 15] = *(a08 + 5); + b[ 16] = *(a09 + 4); + b[ 17] = *(a09 + 5); + b[ 18] = *(a10 + 4); + b[ 19] = *(a10 + 5); + b[ 20] = *(a11 + 4); + b[ 21] = *(a11 + 5); + b[ 22] = *(a12 + 4); + b[ 23] = *(a12 + 5); + b[ 24] = *(a13 + 4); + b[ 25] = *(a13 + 5); + b[ 26] = *(a14 + 4); + b[ 27] = *(a14 + 5); + b[ 28] = *(a15 + 4); + b[ 29] = *(a15 + 5); + b[ 30] = *(a16 + 4); + b[ 31] = *(a16 + 5); + b += 32; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = *(a05 + 6); + b[ 9] = *(a05 + 7); + b[ 10] = *(a06 + 6); + b[ 11] = *(a06 + 7); + b[ 12] = *(a07 + 6); + b[ 13] = *(a07 + 7); + b[ 14] = *(a08 + 6); + b[ 15] = *(a08 + 7); + b[ 16] = *(a09 + 6); + b[ 17] = *(a09 + 7); + b[ 18] = *(a10 + 6); + b[ 19] = *(a10 + 7); + b[ 20] = *(a11 + 6); + b[ 21] = *(a11 + 7); + b[ 22] = *(a12 + 6); + b[ 23] = *(a12 + 7); + b[ 24] = *(a13 + 6); + b[ 25] = *(a13 + 7); + b[ 26] = *(a14 + 6); + b[ 27] = *(a14 + 7); + b[ 28] = *(a15 + 6); + b[ 29] = *(a15 + 7); + b[ 30] = *(a16 + 6); + b[ 31] = *(a16 + 7); + b += 32; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[ 10] = *(a06 + 8); + b[ 11] = *(a06 + 9); + b[ 12] = *(a07 + 8); + b[ 13] = *(a07 + 9); + b[ 14] = *(a08 + 8); + b[ 15] = *(a08 + 9); + b[ 16] = *(a09 + 8); + b[ 17] = *(a09 + 9); + b[ 18] = *(a10 + 8); + b[ 19] = *(a10 + 9); + b[ 20] = *(a11 + 8); + b[ 21] = *(a11 + 9); + b[ 22] = *(a12 + 8); + b[ 23] = *(a12 + 9); + b[ 24] = *(a13 + 8); + b[ 25] = *(a13 + 9); + b[ 26] = *(a14 + 8); + b[ 27] = *(a14 + 9); + b[ 28] = *(a15 + 8); + b[ 29] = *(a15 + 9); + b[ 30] = *(a16 + 8); + b[ 31] = *(a16 + 9); + b += 32; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[ 12] = *(a07 + 10); + b[ 13] = *(a07 + 11); + b[ 14] = *(a08 + 10); + b[ 15] = *(a08 + 11); + b[ 16] = *(a09 + 10); + b[ 17] = *(a09 + 11); + b[ 18] = *(a10 + 10); + b[ 19] = *(a10 + 11); + b[ 20] = *(a11 + 10); + b[ 21] = *(a11 + 11); + b[ 22] = *(a12 + 10); + b[ 23] = *(a12 + 11); + b[ 24] = *(a13 + 10); + b[ 25] = *(a13 + 11); + b[ 26] = *(a14 + 10); + b[ 27] = *(a14 + 11); + b[ 28] = *(a15 + 10); + b[ 29] = *(a15 + 11); + b[ 30] = *(a16 + 10); + b[ 31] = *(a16 + 11); + b += 32; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[ 14] = *(a08 + 12); + b[ 15] = *(a08 + 13); + b[ 16] = *(a09 + 12); + b[ 17] = *(a09 + 13); + b[ 18] = *(a10 + 12); + b[ 19] = *(a10 + 13); + b[ 20] = *(a11 + 12); + b[ 21] = *(a11 + 13); + b[ 22] = *(a12 + 12); + b[ 23] = *(a12 + 13); + b[ 24] = *(a13 + 12); + b[ 25] = *(a13 + 13); + b[ 26] = *(a14 + 12); + b[ 27] = *(a14 + 13); + b[ 28] = *(a15 + 12); + b[ 29] = *(a15 + 13); + b[ 30] = *(a16 + 12); + b[ 31] = *(a16 + 13); + b += 32; + } + + if (i >= 8) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; +#ifdef UNIT + b[ 14] = ONE; + b[ 15] = ZERO; +#else + b[ 14] = *(a08 + 14); + b[ 15] = *(a08 + 15); +#endif + b[ 16] = *(a09 + 14); + b[ 17] = *(a09 + 15); + b[ 18] = *(a10 + 14); + b[ 19] = *(a10 + 15); + b[ 20] = *(a11 + 14); + b[ 21] = *(a11 + 15); + b[ 22] = *(a12 + 14); + b[ 23] = *(a12 + 15); + b[ 24] = *(a13 + 14); + b[ 25] = *(a13 + 15); + b[ 26] = *(a14 + 14); + b[ 27] = *(a14 + 15); + b[ 28] = *(a15 + 14); + b[ 29] = *(a15 + 15); + b[ 30] = *(a16 + 14); + b[ 31] = *(a16 + 15); + b += 32; + } + + if (i >= 9) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; +#ifdef UNIT + b[ 16] = ONE; + b[ 17] = ZERO; +#else + b[ 16] = *(a09 + 16); + b[ 17] = *(a09 + 17); +#endif + b[ 18] = *(a10 + 16); + b[ 19] = *(a10 + 17); + b[ 20] = *(a11 + 16); + b[ 21] = *(a11 + 17); + b[ 22] = *(a12 + 16); + b[ 23] = *(a12 + 17); + b[ 24] = *(a13 + 16); + b[ 25] = *(a13 + 17); + b[ 26] = *(a14 + 16); + b[ 27] = *(a14 + 17); + b[ 28] = *(a15 + 16); + b[ 29] = *(a15 + 17); + b[ 30] = *(a16 + 16); + b[ 31] = *(a16 + 17); + b += 32; + } + + if (i >= 10) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a10 + 18); + b[ 19] = *(a10 + 19); +#endif + b[ 20] = *(a11 + 18); + b[ 21] = *(a11 + 19); + b[ 22] = *(a12 + 18); + b[ 23] = *(a12 + 19); + b[ 24] = *(a13 + 18); + b[ 25] = *(a13 + 19); + b[ 26] = *(a14 + 18); + b[ 27] = *(a14 + 19); + b[ 28] = *(a15 + 18); + b[ 29] = *(a15 + 19); + b[ 30] = *(a16 + 18); + b[ 31] = *(a16 + 19); + b += 32; + } + + if (i >= 11) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a11 + 20); + b[ 21] = *(a11 + 21); +#endif + b[ 22] = *(a12 + 20); + b[ 23] = *(a12 + 21); + b[ 24] = *(a13 + 20); + b[ 25] = *(a13 + 21); + b[ 26] = *(a14 + 20); + b[ 27] = *(a14 + 21); + b[ 28] = *(a15 + 20); + b[ 29] = *(a15 + 21); + b[ 30] = *(a16 + 20); + b[ 31] = *(a16 + 21); + b += 32; + } + + if (i >= 12) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; +#ifdef UNIT + b[ 22] = ONE; + b[ 23] = ZERO; +#else + b[ 22] = *(a12 + 22); + b[ 23] = *(a12 + 23); +#endif + b[ 24] = *(a13 + 22); + b[ 25] = *(a13 + 23); + b[ 26] = *(a14 + 22); + b[ 27] = *(a14 + 23); + b[ 28] = *(a15 + 22); + b[ 29] = *(a15 + 23); + b[ 30] = *(a16 + 22); + b[ 31] = *(a16 + 23); + b += 32; + } + + if (i >= 13) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; +#ifdef UNIT + b[ 24] = ONE; + b[ 25] = ZERO; +#else + b[ 24] = *(a13 + 24); + b[ 25] = *(a13 + 25); +#endif + b[ 26] = *(a14 + 24); + b[ 27] = *(a14 + 25); + b[ 28] = *(a15 + 24); + b[ 29] = *(a15 + 25); + b[ 30] = *(a16 + 24); + b[ 31] = *(a16 + 25); + b += 32; + } + + if (i >= 14) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; +#ifdef UNIT + b[ 26] = ONE; + b[ 27] = ZERO; +#else + b[ 26] = *(a14 + 26); + b[ 27] = *(a14 + 27); +#endif + b[ 28] = *(a15 + 26); + b[ 29] = *(a15 + 27); + b[ 30] = *(a16 + 26); + b[ 31] = *(a16 + 27); + b += 32; + } + + if (i >= 15) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; +#ifdef UNIT + b[ 28] = ONE; + b[ 29] = ZERO; +#else + b[ 28] = *(a15 + 28); + b[ 29] = *(a15 + 29); +#endif + b[ 30] = *(a16 + 28); + b[ 31] = *(a16 + 29); + b += 32; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + b += 16; + } + } else + if (X > posY) { + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + + b += 128; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); +#endif + b[ 20] = *(a03 + 2); + b[ 21] = *(a03 + 3); + b[ 22] = *(a04 + 2); + b[ 23] = *(a04 + 3); + b[ 24] = *(a05 + 2); + b[ 25] = *(a05 + 3); + b[ 26] = *(a06 + 2); + b[ 27] = *(a06 + 3); + b[ 28] = *(a07 + 2); + b[ 29] = *(a07 + 3); + b[ 30] = *(a08 + 2); + b[ 31] = *(a08 + 3); + + b[ 32] = ZERO; + b[ 33] = ZERO; + b[ 34] = ZERO; + b[ 35] = ZERO; +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); +#endif + b[ 38] = *(a04 + 4); + b[ 39] = *(a04 + 5); + b[ 40] = *(a05 + 4); + b[ 41] = *(a05 + 5); + b[ 42] = *(a06 + 4); + b[ 43] = *(a06 + 5); + b[ 44] = *(a07 + 4); + b[ 45] = *(a07 + 5); + b[ 46] = *(a08 + 4); + b[ 47] = *(a08 + 5); + + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); +#endif + b[ 56] = *(a05 + 6); + b[ 57] = *(a05 + 7); + b[ 58] = *(a06 + 6); + b[ 59] = *(a06 + 7); + b[ 60] = *(a07 + 6); + b[ 61] = *(a07 + 7); + b[ 62] = *(a08 + 6); + b[ 63] = *(a08 + 7); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; + b[ 68] = ZERO; + b[ 69] = ZERO; + b[ 70] = ZERO; + b[ 71] = ZERO; +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); +#endif + b[ 74] = *(a06 + 8); + b[ 75] = *(a06 + 9); + b[ 76] = *(a07 + 8); + b[ 77] = *(a07 + 9); + b[ 78] = *(a08 + 8); + b[ 79] = *(a08 + 9); + + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; + b[ 85] = ZERO; + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); +#endif + b[ 92] = *(a07 + 10); + b[ 93] = *(a07 + 11); + b[ 94] = *(a08 + 10); + b[ 95] = *(a08 + 11); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; + b[102] = ZERO; + b[103] = ZERO; + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); +#endif + b[110] = *(a08 + 12); + b[111] = *(a08 + 13); + + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; + b[119] = ZERO; + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); +#endif + + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X < posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + b += 16; + } + } else + if (X > posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; */ + b += 16 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[10] = *(a06 + 0); + b[11] = *(a06 + 1); + b[12] = *(a07 + 0); + b[13] = *(a07 + 1); + b[14] = *(a08 + 0); + b[15] = *(a08 + 1); + b += 16; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a03 + 2); + b[ 5] = *(a03 + 3); + b[ 6] = *(a04 + 2); + b[ 7] = *(a04 + 3); + b[ 8] = *(a05 + 2); + b[ 9] = *(a05 + 3); + b[10] = *(a06 + 2); + b[11] = *(a06 + 3); + b[12] = *(a07 + 2); + b[13] = *(a07 + 3); + b[14] = *(a08 + 2); + b[15] = *(a08 + 3); + b += 16; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a04 + 4); + b[ 7] = *(a04 + 5); + b[ 8] = *(a05 + 4); + b[ 9] = *(a05 + 5); + b[10] = *(a06 + 4); + b[11] = *(a06 + 5); + b[12] = *(a07 + 4); + b[13] = *(a07 + 5); + b[14] = *(a08 + 4); + b[15] = *(a08 + 5); + b += 16; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = *(a05 + 6); + b[ 9] = *(a05 + 7); + b[10] = *(a06 + 6); + b[11] = *(a06 + 7); + b[12] = *(a07 + 6); + b[13] = *(a07 + 7); + b[14] = *(a08 + 6); + b[15] = *(a08 + 7); + b += 16; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[10] = *(a06 + 8); + b[11] = *(a06 + 9); + b[12] = *(a07 + 8); + b[13] = *(a07 + 9); + b[14] = *(a08 + 8); + b[15] = *(a08 + 9); + b += 16; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[12] = *(a07 + 10); + b[13] = *(a07 + 11); + b[14] = *(a08 + 10); + b[15] = *(a08 + 11); + b += 16; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[14] = *(a08 + 12); + b[15] = *(a08 + 13); + b += 16; + } + } + } + + posY += 8; + } + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 4; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + b += 8; + } + } else + if (X > posY) { + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 32; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(a02 + 2); + b[ 11] = *(a02 + 3); +#endif + b[ 12] = *(a03 + 2); + b[ 13] = *(a03 + 3); + b[ 14] = *(a04 + 2); + b[ 15] = *(a04 + 3); + + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); +#endif + b[ 22] = *(a04 + 4); + b[ 23] = *(a04 + 5); + + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); +#endif + + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + b += 8; + } + } else + if (X > posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; */ + b += 8 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b += 8; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a03 + 2); + b[ 5] = *(a03 + 3); + b[ 6] = *(a04 + 2); + b[ 7] = *(a04 + 3); + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a04 + 4); + b[ 7] = *(a04 + 5); + b += 8; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a01 + 2); + b[ 5] = *(a01 + 3); + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); + + a01 += 4; + a02 += 4; + b += 8; + } else + if (X > posY) { + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); +#endif + + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + /* a01 += 2; + a02 += 2; */ + b += 4; + } else + if (X > posY) { + /* a01 += 2 * lda; + a02 += 2 * lda; */ + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + a01 += 2; + b += 2; + } else + if (X > posY) { + a01 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + a01 += lda; + b += 2; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrmm_utcopy_16.c b/kernel/generic/ztrmm_utcopy_16.c new file mode 100644 index 000000000..5aba3727a --- /dev/null +++ b/kernel/generic/ztrmm_utcopy_16.c @@ -0,0 +1,2318 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + lda += lda; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + a09 = a + posX * 2 + (posY + 8) * lda; + a10 = a + posX * 2 + (posY + 9) * lda; + a11 = a + posX * 2 + (posY + 10) * lda; + a12 = a + posX * 2 + (posY + 11) * lda; + a13 = a + posX * 2 + (posY + 12) * lda; + a14 = a + posX * 2 + (posY + 13) * lda; + a15 = a + posX * 2 + (posY + 14) * lda; + a16 = a + posX * 2 + (posY + 15) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + a09 = a + posY * 2 + (posX + 8) * lda; + a10 = a + posY * 2 + (posX + 9) * lda; + a11 = a + posY * 2 + (posX + 10) * lda; + a12 = a + posY * 2 + (posX + 11) * lda; + a13 = a + posY * 2 + (posX + 12) * lda; + a14 = a + posY * 2 + (posX + 13) * lda; + a15 = a + posY * 2 + (posX + 14) * lda; + a16 = a + posY * 2 + (posX + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X < posY) { + a01 += 32; + a02 += 32; + a03 += 32; + a04 += 32; + a05 += 32; + a06 += 32; + a07 += 32; + a08 += 32; + a09 += 32; + a10 += 32; + a11 += 32; + a12 += 32; + a13 += 32; + a14 += 32; + a15 += 32; + a16 += 32; + + b += 512; + } else + if (X > posY) { + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + + a01 += lda; + b += 32; + } + + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a02 + 0); + b[ 33] = *(a02 + 1); +#ifdef UNIT + b[ 34] = ONE; + b[ 35] = ZERO; +#else + b[ 34] = *(a02 + 2); + b[ 35] = *(a02 + 3); +#endif + b[ 36] = ZERO; + b[ 37] = ZERO; + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; + b[ 54] = ZERO; + b[ 55] = ZERO; + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a03 + 0); + b[ 65] = *(a03 + 1); + b[ 66] = *(a03 + 2); + b[ 67] = *(a03 + 3); +#ifdef UNIT + b[ 68] = ONE; + b[ 69] = ZERO; +#else + b[ 68] = *(a03 + 4); + b[ 69] = *(a03 + 5); +#endif + b[ 70] = ZERO; + b[ 71] = ZERO; + b[ 72] = ZERO; + b[ 73] = ZERO; + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; + b[ 85] = ZERO; + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; + b[ 90] = ZERO; + b[ 91] = ZERO; + b[ 92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a04 + 0); + b[ 97] = *(a04 + 1); + b[ 98] = *(a04 + 2); + b[ 99] = *(a04 + 3); + b[100] = *(a04 + 4); + b[101] = *(a04 + 5); +#ifdef UNIT + b[102] = ONE; + b[103] = ZERO; +#else + b[102] = *(a04 + 6); + b[103] = *(a04 + 7); +#endif + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; + b[108] = ZERO; + b[109] = ZERO; + b[110] = ZERO; + b[111] = ZERO; + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; + b[119] = ZERO; + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; + b[126] = ZERO; + b[127] = ZERO; + + b[128] = *(a05 + 0); + b[129] = *(a05 + 1); + b[130] = *(a05 + 2); + b[131] = *(a05 + 3); + b[132] = *(a05 + 4); + b[133] = *(a05 + 5); + b[134] = *(a05 + 6); + b[135] = *(a05 + 7); +#ifdef UNIT + b[136] = ONE; + b[137] = ZERO; +#else + b[136] = *(a05 + 8); + b[137] = *(a05 + 9); +#endif + b[138] = ZERO; + b[139] = ZERO; + b[140] = ZERO; + b[141] = ZERO; + b[142] = ZERO; + b[143] = ZERO; + b[144] = ZERO; + b[145] = ZERO; + b[146] = ZERO; + b[147] = ZERO; + b[148] = ZERO; + b[149] = ZERO; + b[150] = ZERO; + b[151] = ZERO; + b[152] = ZERO; + b[153] = ZERO; + b[154] = ZERO; + b[155] = ZERO; + b[156] = ZERO; + b[157] = ZERO; + b[158] = ZERO; + b[159] = ZERO; + + b[160] = *(a06 + 0); + b[161] = *(a06 + 1); + b[162] = *(a06 + 2); + b[163] = *(a06 + 3); + b[164] = *(a06 + 4); + b[165] = *(a06 + 5); + b[166] = *(a06 + 6); + b[167] = *(a06 + 7); + b[168] = *(a06 + 8); + b[169] = *(a06 + 9); +#ifdef UNIT + b[170] = ONE; + b[171] = ZERO; +#else + b[170] = *(a06 + 10); + b[171] = *(a06 + 11); +#endif + b[172] = ZERO; + b[173] = ZERO; + b[174] = ZERO; + b[175] = ZERO; + b[176] = ZERO; + b[177] = ZERO; + b[178] = ZERO; + b[179] = ZERO; + b[180] = ZERO; + b[181] = ZERO; + b[182] = ZERO; + b[183] = ZERO; + b[184] = ZERO; + b[185] = ZERO; + b[186] = ZERO; + b[187] = ZERO; + b[188] = ZERO; + b[189] = ZERO; + b[190] = ZERO; + b[191] = ZERO; + + b[192] = *(a07 + 0); + b[193] = *(a07 + 1); + b[194] = *(a07 + 2); + b[195] = *(a07 + 3); + b[196] = *(a07 + 4); + b[197] = *(a07 + 5); + b[198] = *(a07 + 6); + b[199] = *(a07 + 7); + b[200] = *(a07 + 8); + b[201] = *(a07 + 9); + b[202] = *(a07 + 10); + b[203] = *(a07 + 11); +#ifdef UNIT + b[204] = ONE; + b[205] = ZERO; +#else + b[204] = *(a07 + 12); + b[205] = *(a07 + 13); +#endif + b[206] = ZERO; + b[207] = ZERO; + b[208] = ZERO; + b[209] = ZERO; + b[210] = ZERO; + b[211] = ZERO; + b[212] = ZERO; + b[213] = ZERO; + b[214] = ZERO; + b[215] = ZERO; + b[216] = ZERO; + b[217] = ZERO; + b[218] = ZERO; + b[219] = ZERO; + b[220] = ZERO; + b[221] = ZERO; + b[222] = ZERO; + b[223] = ZERO; + + b[224] = *(a08 + 0); + b[225] = *(a08 + 1); + b[226] = *(a08 + 2); + b[227] = *(a08 + 3); + b[228] = *(a08 + 4); + b[229] = *(a08 + 5); + b[230] = *(a08 + 6); + b[231] = *(a08 + 7); + b[232] = *(a08 + 8); + b[233] = *(a08 + 9); + b[234] = *(a08 + 10); + b[235] = *(a08 + 11); + b[236] = *(a08 + 12); + b[237] = *(a08 + 13); +#ifdef UNIT + b[238] = ONE; + b[239] = ZERO; +#else + b[238] = *(a08 + 14); + b[239] = *(a08 + 15); +#endif + b[240] = ZERO; + b[241] = ZERO; + b[242] = ZERO; + b[243] = ZERO; + b[244] = ZERO; + b[245] = ZERO; + b[246] = ZERO; + b[247] = ZERO; + b[248] = ZERO; + b[249] = ZERO; + b[250] = ZERO; + b[251] = ZERO; + b[252] = ZERO; + b[253] = ZERO; + b[254] = ZERO; + b[255] = ZERO; + + b[256] = *(a09 + 0); + b[257] = *(a09 + 1); + b[258] = *(a09 + 2); + b[259] = *(a09 + 3); + b[260] = *(a09 + 4); + b[261] = *(a09 + 5); + b[262] = *(a09 + 6); + b[263] = *(a09 + 7); + b[264] = *(a09 + 8); + b[265] = *(a09 + 9); + b[266] = *(a09 + 10); + b[267] = *(a09 + 11); + b[268] = *(a09 + 12); + b[269] = *(a09 + 13); + b[270] = *(a09 + 14); + b[271] = *(a09 + 15); +#ifdef UNIT + b[272] = ONE; + b[273] = ZERO; +#else + b[272] = *(a09 + 16); + b[273] = *(a09 + 17); +#endif + b[274] = ZERO; + b[275] = ZERO; + b[276] = ZERO; + b[277] = ZERO; + b[278] = ZERO; + b[279] = ZERO; + b[280] = ZERO; + b[281] = ZERO; + b[282] = ZERO; + b[283] = ZERO; + b[284] = ZERO; + b[285] = ZERO; + b[286] = ZERO; + b[287] = ZERO; + + b[288] = *(a10 + 0); + b[289] = *(a10 + 1); + b[290] = *(a10 + 2); + b[291] = *(a10 + 3); + b[292] = *(a10 + 4); + b[293] = *(a10 + 5); + b[294] = *(a10 + 6); + b[295] = *(a10 + 7); + b[296] = *(a10 + 8); + b[297] = *(a10 + 9); + b[298] = *(a10 + 10); + b[299] = *(a10 + 11); + b[300] = *(a10 + 12); + b[301] = *(a10 + 13); + b[302] = *(a10 + 14); + b[303] = *(a10 + 15); + b[304] = *(a10 + 16); + b[305] = *(a10 + 17); +#ifdef UNIT + b[306] = ONE; + b[307] = ZERO; +#else + b[306] = *(a10 + 18); + b[307] = *(a10 + 19); +#endif + b[308] = ZERO; + b[309] = ZERO; + b[310] = ZERO; + b[311] = ZERO; + b[312] = ZERO; + b[313] = ZERO; + b[314] = ZERO; + b[315] = ZERO; + b[316] = ZERO; + b[317] = ZERO; + b[318] = ZERO; + b[319] = ZERO; + + b[320] = *(a11 + 0); + b[321] = *(a11 + 1); + b[322] = *(a11 + 2); + b[323] = *(a11 + 3); + b[324] = *(a11 + 4); + b[325] = *(a11 + 5); + b[326] = *(a11 + 6); + b[327] = *(a11 + 7); + b[328] = *(a11 + 8); + b[329] = *(a11 + 9); + b[330] = *(a11 + 10); + b[331] = *(a11 + 11); + b[332] = *(a11 + 12); + b[333] = *(a11 + 13); + b[334] = *(a11 + 14); + b[335] = *(a11 + 15); + b[336] = *(a11 + 16); + b[337] = *(a11 + 17); + b[338] = *(a11 + 18); + b[339] = *(a11 + 19); +#ifdef UNIT + b[340] = ONE; + b[341] = ZERO; +#else + b[340] = *(a11 + 20); + b[341] = *(a11 + 21); +#endif + b[342] = ZERO; + b[343] = ZERO; + b[344] = ZERO; + b[345] = ZERO; + b[346] = ZERO; + b[347] = ZERO; + b[348] = ZERO; + b[349] = ZERO; + b[350] = ZERO; + b[351] = ZERO; + + b[352] = *(a12 + 0); + b[353] = *(a12 + 1); + b[354] = *(a12 + 2); + b[355] = *(a12 + 3); + b[356] = *(a12 + 4); + b[357] = *(a12 + 5); + b[358] = *(a12 + 6); + b[359] = *(a12 + 7); + b[360] = *(a12 + 8); + b[361] = *(a12 + 9); + b[362] = *(a12 + 10); + b[363] = *(a12 + 11); + b[364] = *(a12 + 12); + b[365] = *(a12 + 13); + b[366] = *(a12 + 14); + b[367] = *(a12 + 15); + b[368] = *(a12 + 16); + b[369] = *(a12 + 17); + b[370] = *(a12 + 18); + b[371] = *(a12 + 19); + b[372] = *(a12 + 20); + b[373] = *(a12 + 21); +#ifdef UNIT + b[374] = ONE; + b[375] = ZERO; +#else + b[374] = *(a12 + 22); + b[375] = *(a12 + 23); +#endif + b[376] = ZERO; + b[377] = ZERO; + b[378] = ZERO; + b[379] = ZERO; + b[380] = ZERO; + b[381] = ZERO; + b[382] = ZERO; + b[383] = ZERO; + + b[384] = *(a13 + 0); + b[385] = *(a13 + 1); + b[386] = *(a13 + 2); + b[387] = *(a13 + 3); + b[388] = *(a13 + 4); + b[389] = *(a13 + 5); + b[390] = *(a13 + 6); + b[391] = *(a13 + 7); + b[392] = *(a13 + 8); + b[393] = *(a13 + 9); + b[394] = *(a13 + 10); + b[395] = *(a13 + 11); + b[396] = *(a13 + 12); + b[397] = *(a13 + 13); + b[398] = *(a13 + 14); + b[399] = *(a13 + 15); + b[400] = *(a13 + 16); + b[401] = *(a13 + 17); + b[402] = *(a13 + 18); + b[403] = *(a13 + 19); + b[404] = *(a13 + 20); + b[405] = *(a13 + 21); + b[406] = *(a13 + 22); + b[407] = *(a13 + 23); +#ifdef UNIT + b[408] = ONE; + b[409] = ZERO; +#else + b[408] = *(a13 + 24); + b[409] = *(a13 + 25); +#endif + b[410] = ZERO; + b[411] = ZERO; + b[412] = ZERO; + b[413] = ZERO; + b[414] = ZERO; + b[415] = ZERO; + + b[416] = *(a14 + 0); + b[417] = *(a14 + 1); + b[418] = *(a14 + 2); + b[419] = *(a14 + 3); + b[420] = *(a14 + 4); + b[421] = *(a14 + 5); + b[422] = *(a14 + 6); + b[423] = *(a14 + 7); + b[424] = *(a14 + 8); + b[425] = *(a14 + 9); + b[426] = *(a14 + 10); + b[427] = *(a14 + 11); + b[428] = *(a14 + 12); + b[429] = *(a14 + 13); + b[430] = *(a14 + 14); + b[431] = *(a14 + 15); + b[432] = *(a14 + 16); + b[433] = *(a14 + 17); + b[434] = *(a14 + 18); + b[435] = *(a14 + 19); + b[436] = *(a14 + 20); + b[437] = *(a14 + 21); + b[438] = *(a14 + 22); + b[439] = *(a14 + 23); + b[440] = *(a14 + 24); + b[441] = *(a14 + 25); +#ifdef UNIT + b[442] = ONE; + b[443] = ZERO; +#else + b[442] = *(a14 + 26); + b[443] = *(a14 + 27); +#endif + b[444] = ZERO; + b[445] = ZERO; + b[446] = ZERO; + b[447] = ZERO; + + b[448] = *(a15 + 0); + b[449] = *(a15 + 1); + b[450] = *(a15 + 2); + b[451] = *(a15 + 3); + b[452] = *(a15 + 4); + b[453] = *(a15 + 5); + b[454] = *(a15 + 6); + b[455] = *(a15 + 7); + b[456] = *(a15 + 8); + b[457] = *(a15 + 9); + b[458] = *(a15 + 10); + b[459] = *(a15 + 11); + b[460] = *(a15 + 12); + b[461] = *(a15 + 13); + b[462] = *(a15 + 14); + b[463] = *(a15 + 15); + b[464] = *(a15 + 16); + b[465] = *(a15 + 17); + b[466] = *(a15 + 18); + b[467] = *(a15 + 19); + b[468] = *(a15 + 20); + b[469] = *(a15 + 21); + b[470] = *(a15 + 22); + b[471] = *(a15 + 23); + b[472] = *(a15 + 24); + b[473] = *(a15 + 25); + b[474] = *(a15 + 26); + b[475] = *(a15 + 27); +#ifdef UNIT + b[476] = ONE; + b[477] = ZERO; +#else + b[476] = *(a15 + 28); + b[477] = *(a15 + 29); +#endif + b[478] = ZERO; + b[479] = ZERO; + + b[480] = *(a16 + 0); + b[481] = *(a16 + 1); + b[482] = *(a16 + 2); + b[483] = *(a16 + 3); + b[484] = *(a16 + 4); + b[485] = *(a16 + 5); + b[486] = *(a16 + 6); + b[487] = *(a16 + 7); + b[488] = *(a16 + 8); + b[489] = *(a16 + 9); + b[490] = *(a16 + 10); + b[491] = *(a16 + 11); + b[492] = *(a16 + 12); + b[493] = *(a16 + 13); + b[494] = *(a16 + 14); + b[495] = *(a16 + 15); + b[496] = *(a16 + 16); + b[497] = *(a16 + 17); + b[498] = *(a16 + 18); + b[499] = *(a16 + 19); + b[500] = *(a16 + 20); + b[501] = *(a16 + 21); + b[502] = *(a16 + 22); + b[503] = *(a16 + 23); + b[504] = *(a16 + 24); + b[505] = *(a16 + 25); + b[506] = *(a16 + 26); + b[507] = *(a16 + 27); + b[508] = *(a16 + 28); + b[509] = *(a16 + 29); +#ifdef UNIT + b[510] = ONE; + b[511] = ZERO; +#else + b[510] = *(a16 + 30); + b[511] = *(a16 + 31); +#endif + + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + b += 512; + } + + X += 16; + i --; + } while (i > 0); + } + + i = (m & 15); + if (i) { + + if (X < posY) { + // a01 += 2 * i; + // a02 += 2 * i; + // a03 += 2 * i; + // a04 += 2 * i; + // a05 += 2 * i; + // a06 += 2 * i; + // a07 += 2 * i; + // a08 += 2 * i; + // a09 += 2 * i; + // a10 += 2 * i; + // a11 += 2 * i; + // a12 += 2 * i; + // a13 += 2 * i; + // a14 += 2 * i; + // a15 += 2 * i; + // a16 += 2 * i; + b += 32 * i; + + } else + if (X > posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + a09 += lda; + a10 += lda; + a11 += lda; + a12 += lda; + a13 += lda; + a14 += lda; + a15 += lda; + a16 += lda; + b += 32; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + + if (i >= 2) { + b[ 0] = *(a02 + 0); + b[ 1] = *(a02 + 1); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); + b[ 2] = *(a03 + 2); + b[ 3] = *(a03 + 3); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 4) { + b[ 0] = *(a04 + 0); + b[ 1] = *(a04 + 1); + b[ 2] = *(a04 + 2); + b[ 3] = *(a04 + 3); + b[ 4] = *(a04 + 4); + b[ 5] = *(a04 + 5); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 5) { + b[ 0] = *(a05 + 0); + b[ 1] = *(a05 + 1); + b[ 2] = *(a05 + 2); + b[ 3] = *(a05 + 3); + b[ 4] = *(a05 + 4); + b[ 5] = *(a05 + 5); + b[ 6] = *(a05 + 6); + b[ 7] = *(a05 + 7); +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 6) { + b[ 0] = *(a06 + 0); + b[ 1] = *(a06 + 1); + b[ 2] = *(a06 + 2); + b[ 3] = *(a06 + 3); + b[ 4] = *(a06 + 4); + b[ 5] = *(a06 + 5); + b[ 6] = *(a06 + 6); + b[ 7] = *(a06 + 7); + b[ 8] = *(a06 + 8); + b[ 9] = *(a06 + 9); +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 7) { + b[ 0] = *(a07 + 0); + b[ 1] = *(a07 + 1); + b[ 2] = *(a07 + 2); + b[ 3] = *(a07 + 3); + b[ 4] = *(a07 + 4); + b[ 5] = *(a07 + 5); + b[ 6] = *(a07 + 6); + b[ 7] = *(a07 + 7); + b[ 8] = *(a07 + 8); + b[ 9] = *(a07 + 9); + b[10] = *(a07 + 10); + b[11] = *(a07 + 11); +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 8) { + b[ 0] = *(a08 + 0); + b[ 1] = *(a08 + 1); + b[ 2] = *(a08 + 2); + b[ 3] = *(a08 + 3); + b[ 4] = *(a08 + 4); + b[ 5] = *(a08 + 5); + b[ 6] = *(a08 + 6); + b[ 7] = *(a08 + 7); + b[ 8] = *(a08 + 8); + b[ 9] = *(a08 + 9); + b[ 10] = *(a08 + 10); + b[ 11] = *(a08 + 11); + b[ 12] = *(a08 + 12); + b[ 13] = *(a08 + 13); +#ifdef UNIT + b[ 14] = ONE; + b[ 15] = ZERO; +#else + b[ 14] = *(a08 + 14); + b[ 15] = *(a08 + 15); +#endif + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 9) { + b[ 0] = *(a09 + 0); + b[ 1] = *(a09 + 1); + b[ 2] = *(a09 + 2); + b[ 3] = *(a09 + 3); + b[ 4] = *(a09 + 4); + b[ 5] = *(a09 + 5); + b[ 6] = *(a09 + 6); + b[ 7] = *(a09 + 7); + b[ 8] = *(a09 + 8); + b[ 9] = *(a09 + 9); + b[ 10] = *(a09 + 10); + b[ 11] = *(a09 + 11); + b[ 12] = *(a09 + 12); + b[ 13] = *(a09 + 13); + b[ 14] = *(a09 + 14); + b[ 15] = *(a09 + 15); +#ifdef UNIT + b[ 16] = ONE; + b[ 17] = ZERO; +#else + b[ 16] = *(a09 + 16); + b[ 17] = *(a09 + 17); +#endif + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 10) { + b[ 0] = *(a10 + 0); + b[ 1] = *(a10 + 1); + b[ 2] = *(a10 + 2); + b[ 3] = *(a10 + 3); + b[ 4] = *(a10 + 4); + b[ 5] = *(a10 + 5); + b[ 6] = *(a10 + 6); + b[ 7] = *(a10 + 7); + b[ 8] = *(a10 + 8); + b[ 9] = *(a10 + 9); + b[ 10] = *(a10 + 10); + b[ 11] = *(a10 + 11); + b[ 12] = *(a10 + 12); + b[ 13] = *(a10 + 13); + b[ 14] = *(a10 + 14); + b[ 15] = *(a10 + 15); + b[ 16] = *(a10 + 16); + b[ 17] = *(a10 + 17); +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a10 + 18); + b[ 19] = *(a10 + 19); +#endif + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 11) { + b[ 0] = *(a11 + 0); + b[ 1] = *(a11 + 1); + b[ 2] = *(a11 + 2); + b[ 3] = *(a11 + 3); + b[ 4] = *(a11 + 4); + b[ 5] = *(a11 + 5); + b[ 6] = *(a11 + 6); + b[ 7] = *(a11 + 7); + b[ 8] = *(a11 + 8); + b[ 9] = *(a11 + 9); + b[ 10] = *(a11 + 10); + b[ 11] = *(a11 + 11); + b[ 12] = *(a11 + 12); + b[ 13] = *(a11 + 13); + b[ 14] = *(a11 + 14); + b[ 15] = *(a11 + 15); + b[ 16] = *(a11 + 16); + b[ 17] = *(a11 + 17); + b[ 18] = *(a11 + 18); + b[ 19] = *(a11 + 19); +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a11 + 20); + b[ 21] = *(a11 + 21); +#endif + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 12) { + b[ 0] = *(a12 + 0); + b[ 1] = *(a12 + 1); + b[ 2] = *(a12 + 2); + b[ 3] = *(a12 + 3); + b[ 4] = *(a12 + 4); + b[ 5] = *(a12 + 5); + b[ 6] = *(a12 + 6); + b[ 7] = *(a12 + 7); + b[ 8] = *(a12 + 8); + b[ 9] = *(a12 + 9); + b[ 10] = *(a12 + 10); + b[ 11] = *(a12 + 11); + b[ 12] = *(a12 + 12); + b[ 13] = *(a12 + 13); + b[ 14] = *(a12 + 14); + b[ 15] = *(a12 + 15); + b[ 16] = *(a12 + 16); + b[ 17] = *(a12 + 17); + b[ 18] = *(a12 + 18); + b[ 19] = *(a12 + 19); + b[ 20] = *(a12 + 20); + b[ 21] = *(a12 + 21); +#ifdef UNIT + b[ 22] = ONE; + b[ 23] = ZERO; +#else + b[ 22] = *(a12 + 22); + b[ 23] = *(a12 + 23); +#endif + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 13) { + b[ 0] = *(a13 + 0); + b[ 1] = *(a13 + 1); + b[ 2] = *(a13 + 2); + b[ 3] = *(a13 + 3); + b[ 4] = *(a13 + 4); + b[ 5] = *(a13 + 5); + b[ 6] = *(a13 + 6); + b[ 7] = *(a13 + 7); + b[ 8] = *(a13 + 8); + b[ 9] = *(a13 + 9); + b[ 10] = *(a13 + 10); + b[ 11] = *(a13 + 11); + b[ 12] = *(a13 + 12); + b[ 13] = *(a13 + 13); + b[ 14] = *(a13 + 14); + b[ 15] = *(a13 + 15); + b[ 16] = *(a13 + 16); + b[ 17] = *(a13 + 17); + b[ 18] = *(a13 + 18); + b[ 19] = *(a13 + 19); + b[ 20] = *(a13 + 20); + b[ 21] = *(a13 + 21); + b[ 22] = *(a13 + 22); + b[ 23] = *(a13 + 23); +#ifdef UNIT + b[ 24] = ONE; + b[ 25] = ZERO; +#else + b[ 24] = *(a13 + 24); + b[ 25] = *(a13 + 25); +#endif + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 14) { + b[ 0] = *(a14 + 0); + b[ 1] = *(a14 + 1); + b[ 2] = *(a14 + 2); + b[ 3] = *(a14 + 3); + b[ 4] = *(a14 + 4); + b[ 5] = *(a14 + 5); + b[ 6] = *(a14 + 6); + b[ 7] = *(a14 + 7); + b[ 8] = *(a14 + 8); + b[ 9] = *(a14 + 9); + b[ 10] = *(a14 + 10); + b[ 11] = *(a14 + 11); + b[ 12] = *(a14 + 12); + b[ 13] = *(a14 + 13); + b[ 14] = *(a14 + 14); + b[ 15] = *(a14 + 15); + b[ 16] = *(a14 + 16); + b[ 17] = *(a14 + 17); + b[ 18] = *(a14 + 18); + b[ 19] = *(a14 + 19); + b[ 20] = *(a14 + 20); + b[ 21] = *(a14 + 21); + b[ 22] = *(a14 + 22); + b[ 23] = *(a14 + 23); + b[ 24] = *(a14 + 24); + b[ 25] = *(a14 + 25); +#ifdef UNIT + b[ 26] = ONE; + b[ 27] = ZERO; +#else + b[ 26] = *(a14 + 26); + b[ 27] = *(a14 + 27); +#endif + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 15) { + b[ 0] = *(a15 + 0); + b[ 1] = *(a15 + 1); + b[ 2] = *(a15 + 2); + b[ 3] = *(a15 + 3); + b[ 4] = *(a15 + 4); + b[ 5] = *(a15 + 5); + b[ 6] = *(a15 + 6); + b[ 7] = *(a15 + 7); + b[ 8] = *(a15 + 8); + b[ 9] = *(a15 + 9); + b[ 10] = *(a15 + 10); + b[ 11] = *(a15 + 11); + b[ 12] = *(a15 + 12); + b[ 13] = *(a15 + 13); + b[ 14] = *(a15 + 14); + b[ 15] = *(a15 + 15); + b[ 16] = *(a15 + 16); + b[ 17] = *(a15 + 17); + b[ 18] = *(a15 + 18); + b[ 19] = *(a15 + 19); + b[ 20] = *(a15 + 20); + b[ 21] = *(a15 + 21); + b[ 22] = *(a15 + 22); + b[ 23] = *(a15 + 23); + b[ 24] = *(a15 + 24); + b[ 25] = *(a15 + 25); + b[ 26] = *(a15 + 26); + b[ 27] = *(a15 + 27); +#ifdef UNIT + b[ 28] = ONE; + b[ 29] = ZERO; +#else + b[ 28] = *(a15 + 28); + b[ 29] = *(a15 + 29); +#endif + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X < posY) { + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } else + if (X > posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + b += 16; + } + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a02 + 0); + b[ 17] = *(a02 + 1); +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); +#endif + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a03 + 0); + b[ 33] = *(a03 + 1); + b[ 34] = *(a03 + 2); + b[ 35] = *(a03 + 3); +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); +#endif + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + + b[ 48] = *(a04 + 0); + b[ 49] = *(a04 + 1); + b[ 50] = *(a04 + 2); + b[ 51] = *(a04 + 3); + b[ 52] = *(a04 + 4); + b[ 53] = *(a04 + 5); +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); +#endif + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a05 + 0); + b[ 65] = *(a05 + 1); + b[ 66] = *(a05 + 2); + b[ 67] = *(a05 + 3); + b[ 68] = *(a05 + 4); + b[ 69] = *(a05 + 5); + b[ 70] = *(a05 + 6); + b[ 71] = *(a05 + 7); +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); +#endif + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + + b[ 80] = *(a06 + 0); + b[ 81] = *(a06 + 1); + b[ 82] = *(a06 + 2); + b[ 83] = *(a06 + 3); + b[ 84] = *(a06 + 4); + b[ 85] = *(a06 + 5); + b[ 86] = *(a06 + 6); + b[ 87] = *(a06 + 7); + b[ 88] = *(a06 + 8); + b[ 89] = *(a06 + 9); +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); +#endif + b[ 92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a07 + 0); + b[ 97] = *(a07 + 1); + b[ 98] = *(a07 + 2); + b[ 99] = *(a07 + 3); + b[100] = *(a07 + 4); + b[101] = *(a07 + 5); + b[102] = *(a07 + 6); + b[103] = *(a07 + 7); + b[104] = *(a07 + 8); + b[105] = *(a07 + 9); + b[106] = *(a07 + 10); + b[107] = *(a07 + 11); +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); +#endif + b[110] = ZERO; + b[111] = ZERO; + + b[112] = *(a08 + 0); + b[113] = *(a08 + 1); + b[114] = *(a08 + 2); + b[115] = *(a08 + 3); + b[116] = *(a08 + 4); + b[117] = *(a08 + 5); + b[118] = *(a08 + 6); + b[119] = *(a08 + 7); + b[120] = *(a08 + 8); + b[121] = *(a08 + 9); + b[122] = *(a08 + 10); + b[123] = *(a08 + 11); + b[124] = *(a08 + 12); + b[125] = *(a08 + 13); +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); +#endif + + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X < posY) { + /* a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; + a05 += 2 * i; + a06 += 2 * i; + a07 += 2 * i; + a08 += 2 * i; */ + b += 16 * i; + } else + if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + b += 16; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + + if(i >= 2) { + b[ 0] = *(a02 + 0); + b[ 1] = *(a02 + 1); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); + b[ 2] = *(a03 + 2); + b[ 3] = *(a03 + 3); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 4) { + b[ 0] = *(a04 + 0); + b[ 1] = *(a04 + 1); + b[ 2] = *(a04 + 2); + b[ 3] = *(a04 + 3); + b[ 4] = *(a04 + 4); + b[ 5] = *(a04 + 5); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 5) { + b[ 0] = *(a05 + 0); + b[ 1] = *(a05 + 1); + b[ 2] = *(a05 + 2); + b[ 3] = *(a05 + 3); + b[ 4] = *(a05 + 4); + b[ 5] = *(a05 + 5); + b[ 6] = *(a05 + 6); + b[ 7] = *(a05 + 7); +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 6) { + b[ 0] = *(a06 + 0); + b[ 1] = *(a06 + 1); + b[ 2] = *(a06 + 2); + b[ 3] = *(a06 + 3); + b[ 4] = *(a06 + 4); + b[ 5] = *(a06 + 5); + b[ 6] = *(a06 + 6); + b[ 7] = *(a06 + 7); + b[ 8] = *(a06 + 8); + b[ 9] = *(a06 + 9); +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 7) { + b[ 0] = *(a07 + 0); + b[ 1] = *(a07 + 1); + b[ 2] = *(a07 + 2); + b[ 3] = *(a07 + 3); + b[ 4] = *(a07 + 4); + b[ 5] = *(a07 + 5); + b[ 6] = *(a07 + 6); + b[ 7] = *(a07 + 7); + b[ 8] = *(a07 + 8); + b[ 9] = *(a07 + 9); + b[10] = *(a07 + 10); + b[11] = *(a07 + 11); +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + } + } + + posY += 8; + } + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + b += 32; + } else + if (X > posY) { + + for (ii = 0; ii < 4; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + b += 8; + } + + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(a02 + 0); + b[ 9] = *(a02 + 1); +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(a02 + 2); + b[ 11] = *(a02 + 3); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a03 + 0); + b[ 17] = *(a03 + 1); + b[ 18] = *(a03 + 2); + b[ 19] = *(a03 + 3); +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); +#endif + b[ 22] = ZERO; + b[ 23] = ZERO; + + b[ 24] = *(a04 + 0); + b[ 25] = *(a04 + 1); + b[ 26] = *(a04 + 2); + b[ 27] = *(a04 + 3); + b[ 28] = *(a04 + 4); + b[ 29] = *(a04 + 5); +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); +#endif + + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + /* a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; */ + b += 8 * i; + } else + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + b += 8; + } + } else { + +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if(i >= 2) { + b[ 0] = *(a02 + 0); + b[ 1] = *(a02 + 1); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); + b[ 2] = *(a03 + 2); + b[ 3] = *(a03 + 3); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + } + } + + posY += 4; + } + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + a01 += 4; + a02 += 4; + b += 8; + } else + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a02 + 0); + b[ 5] = *(a02 + 1); + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); + + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } else { + +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = *(a02 + 0); + b[ 5] = *(a02 + 1); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); +#endif + + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + b += 4; + } else + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b += 4; + } +#if 1 + } +#else + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b += 4; + } +#endif + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + a01 += 2; + } else { +#ifdef UNIT + if (X > posY) { +#endif + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#ifdef UNIT + } else { + b[ 0] = ONE; + b[ 1] = ZERO; + } +#endif + a01 += lda; + } + b += 2; + X ++; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrmmkernel_4x4.c b/kernel/generic/ztrmmkernel_4x4.c old mode 100755 new mode 100644 diff --git a/kernel/generic/ztrsm_lncopy_16.c b/kernel/generic/ztrsm_lncopy_16.c new file mode 100644 index 000000000..4fd72c13e --- /dev/null +++ b/kernel/generic/ztrsm_lncopy_16.c @@ -0,0 +1,308 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + FLOAT data1, data2; + + lda *= 2; + jj = offset; + + j = (n >> 4); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + a9 = a + 8 * lda; + a10 = a + 9 * lda; + a11 = a + 10 * lda; + a12 = a + 11 * lda; + a13 = a + 12 * lda; + a14 = a + 13 * lda; + a15 = a + 14 * lda; + a16 = a + 15 * lda; + + a += 16 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 16) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + *(b + 8) = *(a5 + 0); + *(b + 9) = *(a5 + 1); + *(b + 10) = *(a6 + 0); + *(b + 11) = *(a6 + 1); + *(b + 12) = *(a7 + 0); + *(b + 13) = *(a7 + 1); + *(b + 14) = *(a8 + 0); + *(b + 15) = *(a8 + 1); + *(b + 16) = *(a9 + 0); + *(b + 17) = *(a9 + 1); + *(b + 18) = *(a10 + 0); + *(b + 19) = *(a10 + 1); + *(b + 20) = *(a11 + 0); + *(b + 21) = *(a11 + 1); + *(b + 22) = *(a12 + 0); + *(b + 23) = *(a12 + 1); + *(b + 24) = *(a13 + 0); + *(b + 25) = *(a13 + 1); + *(b + 26) = *(a14 + 0); + *(b + 27) = *(a14 + 1); + *(b + 28) = *(a15 + 0); + *(b + 29) = *(a15 + 1); + *(b + 30) = *(a16 + 0); + *(b + 31) = *(a16 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + a9 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; + ii ++; + } + + jj += 16; + j --; + } + + if (n & 8) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + + a += 8 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 8) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + *(b + 8) = *(a5 + 0); + *(b + 9) = *(a5 + 1); + *(b + 10) = *(a6 + 0); + *(b + 11) = *(a6 + 1); + *(b + 12) = *(a7 + 0); + *(b + 13) = *(a7 + 1); + *(b + 14) = *(a8 + 0); + *(b + 15) = *(a8 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + b += 16; + ii ++; + } + + jj += 8; + } + + if (n & 4) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a += 4 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 4) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + ii ++; + } + + jj += 4; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a += 2 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 2) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + } + + a1 += 2; + a2 += 2; + b += 4; + ii ++; + } + + jj += 2; + } + + if (n & 1) { + + a1 = a + 0 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 1) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + a1 += 2; + b += 2; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_ltcopy_16.c b/kernel/generic/ztrsm_ltcopy_16.c new file mode 100644 index 000000000..e9aeae1ad --- /dev/null +++ b/kernel/generic/ztrsm_ltcopy_16.c @@ -0,0 +1,264 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1; + FLOAT data1, data2; + + lda *= 2; + jj = offset; + + j = (n >> 4); + while (j > 0){ + + a1 = a; + a += 32; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 16; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + *(b + 16) = *(a1 + 16); + *(b + 17) = *(a1 + 17); + *(b + 18) = *(a1 + 18); + *(b + 19) = *(a1 + 19); + *(b + 20) = *(a1 + 20); + *(b + 21) = *(a1 + 21); + *(b + 22) = *(a1 + 22); + *(b + 23) = *(a1 + 23); + *(b + 24) = *(a1 + 24); + *(b + 25) = *(a1 + 25); + *(b + 26) = *(a1 + 26); + *(b + 27) = *(a1 + 27); + *(b + 28) = *(a1 + 28); + *(b + 29) = *(a1 + 29); + *(b + 30) = *(a1 + 30); + *(b + 31) = *(a1 + 31); + } + + b += 32; + a1 += lda; + ii ++; + } + + jj += 16; + j --; + } + + j = (n & 8); + if (j > 0) { + a1 = a; + a += 16; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 8; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + } + + b += 16; + a1 += lda; + ii ++; + } + + jj += 8; + } + + j = (n & 4); + if (j > 0) { + + a1 = a; + a += 8; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 4; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + } + + b += 8; + a1 += lda; + ii ++; + } + + jj += 4; + } + + j = (n & 2); + if (j > 0) { + + a1 = a; + a += 4; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 2; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + } + + b += 4; + a1 += lda; + ii ++; + } + + jj += 2; + } + + j = (n & 1); + if (j > 0) { + + a1 = a; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + b += 2; + a1 += lda; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_uncopy_16.c b/kernel/generic/ztrsm_uncopy_16.c new file mode 100644 index 000000000..e84d96891 --- /dev/null +++ b/kernel/generic/ztrsm_uncopy_16.c @@ -0,0 +1,313 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + FLOAT data1, data2; + + lda *= 2; + jj = offset; + + j = (n >> 4); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + a9 = a + 8 * lda; + a10 = a + 9 * lda; + a11 = a + 10 * lda; + a12 = a + 11 * lda; + a13 = a + 12 * lda; + a14 = a + 13 * lda; + a15 = a + 14 * lda; + a16 = a + 15 * lda; + + a += 16 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { + + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 16; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + *(b + 8) = *(a5 + 0); + *(b + 9) = *(a5 + 1); + *(b + 10) = *(a6 + 0); + *(b + 11) = *(a6 + 1); + *(b + 12) = *(a7 + 0); + *(b + 13) = *(a7 + 1); + *(b + 14) = *(a8 + 0); + *(b + 15) = *(a8 + 1); + *(b + 16) = *(a9 + 0); + *(b + 17) = *(a9 + 1); + *(b + 18) = *(a10 + 0); + *(b + 19) = *(a10 + 1); + *(b + 20) = *(a11 + 0); + *(b + 21) = *(a11 + 1); + *(b + 22) = *(a12 + 0); + *(b + 23) = *(a12 + 1); + *(b + 24) = *(a13 + 0); + *(b + 25) = *(a13 + 1); + *(b + 26) = *(a14 + 0); + *(b + 27) = *(a14 + 1); + *(b + 28) = *(a15 + 0); + *(b + 29) = *(a15 + 1); + *(b + 30) = *(a16 + 0); + *(b + 31) = *(a16 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + a9 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; + ii ++; + } + + jj += 16; + j --; + } + + if (n & 8) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + + a += 8 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 8; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + *(b + 8) = *(a5 + 0); + *(b + 9) = *(a5 + 1); + *(b + 10) = *(a6 + 0); + *(b + 11) = *(a6 + 1); + *(b + 12) = *(a7 + 0); + *(b + 13) = *(a7 + 1); + *(b + 14) = *(a8 + 0); + *(b + 15) = *(a8 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + b += 16; + ii ++; + } + + jj += 8; + } + + if (n & 4) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a += 4 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 4; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + ii ++; + } + + jj += 4; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a += 2 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + for (k = ii - jj + 1; k < 2; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + } + + a1 += 2; + a2 += 2; + b += 4; + ii ++; + } + + jj += 2; + } + + if (n & 1) { + + a1 = a + 0 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + for (k = ii - jj + 1; k < 1; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + a1 += 2; + b += 2; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_utcopy_1.c b/kernel/generic/ztrsm_utcopy_1.c index 08f85e891..5833a64ef 100644 --- a/kernel/generic/ztrsm_utcopy_1.c +++ b/kernel/generic/ztrsm_utcopy_1.c @@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG i, ii, j, jj; - FLOAT data01, data02; + FLOAT data01=0.0, data02=0.0; FLOAT *a1; lda *= 2; diff --git a/kernel/generic/ztrsm_utcopy_16.c b/kernel/generic/ztrsm_utcopy_16.c new file mode 100644 index 000000000..efcea5c3f --- /dev/null +++ b/kernel/generic/ztrsm_utcopy_16.c @@ -0,0 +1,261 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, data1, data2; + + lda *= 2; + + jj = offset; + + j = (n >> 4); + while (j > 0){ + + a1 = a; + a += 32; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 16) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + *(b + 16) = *(a1 + 16); + *(b + 17) = *(a1 + 17); + *(b + 18) = *(a1 + 18); + *(b + 19) = *(a1 + 19); + *(b + 20) = *(a1 + 20); + *(b + 21) = *(a1 + 21); + *(b + 22) = *(a1 + 22); + *(b + 23) = *(a1 + 23); + *(b + 24) = *(a1 + 24); + *(b + 25) = *(a1 + 25); + *(b + 26) = *(a1 + 26); + *(b + 27) = *(a1 + 27); + *(b + 28) = *(a1 + 28); + *(b + 29) = *(a1 + 29); + *(b + 30) = *(a1 + 30); + *(b + 31) = *(a1 + 31); + } + + b += 32; + a1 += lda; + ii ++; + } + + jj += 16; + j --; + } + + j = (n & 8); + if (j > 0) { + a1 = a; + a += 16; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 8) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + } + + b += 16; + a1 += lda; + ii ++; + } + + jj += 8; + } + + j = (n & 4); + if (j > 0) { + + a1 = a; + a += 8; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 4) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + } + + b += 8; + a1 += lda; + ii ++; + } + + jj += 4; + } + + j = (n & 2); + if (j > 0) { + + a1 = a; + a += 4; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 2) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + } + + b += 4; + a1 += lda; + ii ++; + } + + jj += 2; + } + + j = (n & 1); + if (j > 0) { + + a1 = a; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 1) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + b += 2; + a1 += lda; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_utcopy_2.c b/kernel/generic/ztrsm_utcopy_2.c index 387bb2532..bc495f7c6 100644 --- a/kernel/generic/ztrsm_utcopy_2.c +++ b/kernel/generic/ztrsm_utcopy_2.c @@ -47,6 +47,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT FLOAT data05, data06, data07, data08; FLOAT *a1, *a2; + data01=data02=data07=data08=0.0; lda *= 2; jj = offset; diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 new file mode 100644 index 000000000..5b54a2ada --- /dev/null +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -0,0 +1,140 @@ +ifndef NO_LSX + +SDOTKERNEL = dot_lsx.S +DSDOTKERNEL = dot_lsx.S +DDOTKERNEL = dot_lsx.S +CDOTKERNEL = cdot_lsx.S +ZDOTKERNEL = cdot_lsx.S + +SSCALKERNEL = scal_lsx.S +DSCALKERNEL = scal_lsx.S +CSCALKERNEL = cscal_lsx.S +ZSCALKERNEL = cscal_lsx.S + +SAMAXKERNEL = amax_lsx.S +DAMAXKERNEL = amax_lsx.S +CAMAXKERNEL = camax_lsx.S +ZAMAXKERNEL = camax_lsx.S + +SAMINKERNEL = amin_lsx.S +DAMINKERNEL = amin_lsx.S +CAMINKERNEL = camin_lsx.S +ZAMINKERNEL = camin_lsx.S + +SMAXKERNEL = max_lsx.S +DMAXKERNEL = max_lsx.S + +SMINKERNEL = min_lsx.S +DMINKERNEL = min_lsx.S + +ISMAXKERNEL = imax_lsx.S +IDMAXKERNEL = imax_lsx.S + +ISMINKERNEL = imin_lsx.S +IDMINKERNEL = imin_lsx.S + +ISAMAXKERNEL = iamax_lsx.S +IDAMAXKERNEL = iamax_lsx.S +ICAMAXKERNEL = icamax_lsx.S +IZAMAXKERNEL = icamax_lsx.S + +ISAMINKERNEL = iamin_lsx.S +IDAMINKERNEL = iamin_lsx.S +ICAMINKERNEL = icamin_lsx.S +IZAMINKERNEL = icamin_lsx.S + +SCOPYKERNEL = copy_lsx.S +DCOPYKERNEL = copy_lsx.S +CCOPYKERNEL = ccopy_lsx.S +ZCOPYKERNEL = ccopy_lsx.S + +SSWAPKERNEL = swap_lsx.S +DSWAPKERNEL = swap_lsx.S + +SAXPYKERNEL = axpy_lsx.S +DAXPYKERNEL = axpy_lsx.S +CAXPYKERNEL = caxpy_lsx.S +ZAXPYKERNEL = caxpy_lsx.S + +SAXPBYKERNEL = axpby_lsx.S +DAXPBYKERNEL = axpby_lsx.S +CAXPBYKERNEL = caxpby_lsx.S +ZAXPBYKERNEL = caxpby_lsx.S + +SSUMKERNEL = sum_lsx.S +DSUMKERNEL = sum_lsx.S + +SASUMKERNEL = asum_lsx.S +DASUMKERNEL = asum_lsx.S +CASUMKERNEL = casum_lsx.S +ZASUMKERNEL = casum_lsx.S + +SROTKERNEL = rot_lsx.S +DROTKERNEL = rot_lsx.S +CROTKERNEL = crot_lsx.S +ZROTKERNEL = crot_lsx.S + +SNRM2KERNEL = snrm2_lsx.S +DNRM2KERNEL = dnrm2_lsx.S +CNRM2KERNEL = cnrm2_lsx.S +ZNRM2KERNEL = znrm2_lsx.S + +CSWAPKERNEL = cswap_lsx.S +ZSWAPKERNEL = cswap_lsx.S + +CSUMKERNEL = csum_lsx.S +ZSUMKERNEL = csum_lsx.S + +SGEMVNKERNEL = sgemv_n_lsx.S +SGEMVTKERNEL = sgemv_t_lsx.S + +DGEMVNKERNEL = dgemv_n_lsx.S +DGEMVTKERNEL = dgemv_t_lsx.S + +DGEMMKERNEL = dgemm_kernel_8x4.S +DGEMMINCOPY = dgemm_ncopy_8_lsx.S +DGEMMITCOPY = dgemm_tcopy_8_lsx.S +DGEMMONCOPY = dgemm_ncopy_4_lsx.S +DGEMMOTCOPY = dgemm_tcopy_4_lsx.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CGEMVNKERNEL = cgemv_n_4_lsx.S +CGEMVTKERNEL = cgemv_t_4_lsx.S + +CGEMMKERNEL = cgemm_kernel_8x4_lsx.S +CGEMMINCOPY = cgemm_ncopy_8_lsx.S +CGEMMITCOPY = cgemm_tcopy_8_lsx.S +CGEMMONCOPY = cgemm_ncopy_4_lsx.S +CGEMMOTCOPY = cgemm_tcopy_4_lsx.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZGEMVNKERNEL = zgemv_n_2_lsx.S +ZGEMVTKERNEL = zgemv_t_2_lsx.S + +ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S +ZGEMMONCOPY = zgemm_ncopy_4_lsx.S +ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index cda359040..9b55d1bbb 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -1,3 +1,90 @@ +ifndef NO_LASX + +SDOTKERNEL = dot_lasx.S +DSDOTKERNEL = dot_lasx.S +DDOTKERNEL = dot_lasx.S +CDOTKERNEL = cdot_lasx.S +ZDOTKERNEL = cdot_lasx.S + +SSCALKERNEL = scal_lasx.S +DSCALKERNEL = scal_lasx.S +CSCALKERNEL = cscal_lasx.S +ZSCALKERNEL = cscal_lasx.S + +SAMAXKERNEL = amax_lasx.S +DAMAXKERNEL = amax_lasx.S +CAMAXKERNEL = camax_lasx.S +ZAMAXKERNEL = camax_lasx.S + +SAMINKERNEL = amin_lasx.S +DAMINKERNEL = amin_lasx.S +CAMINKERNEL = camin_lasx.S +ZAMINKERNEL = camin_lasx.S + +SMAXKERNEL = max_lsx.S +DMAXKERNEL = max_lsx.S + +SMINKERNEL = min_lsx.S +DMINKERNEL = min_lsx.S + +ISMAXKERNEL = imax_lasx.S +IDMAXKERNEL = imax_lasx.S + +ISMINKERNEL = imin_lasx.S +IDMINKERNEL = imin_lasx.S + +ISAMAXKERNEL = iamax_lasx.S +IDAMAXKERNEL = iamax_lasx.S +ICAMAXKERNEL = icamax_lasx.S +IZAMAXKERNEL = icamax_lasx.S + +ISAMINKERNEL = iamin_lasx.S +IDAMINKERNEL = iamin_lasx.S +ICAMINKERNEL = icamin_lasx.S +IZAMINKERNEL = icamin_lasx.S + +SCOPYKERNEL = copy_lasx.S +DCOPYKERNEL = copy_lasx.S +CCOPYKERNEL = ccopy_lasx.S +ZCOPYKERNEL = ccopy_lasx.S + +SSWAPKERNEL = swap_lasx.S +DSWAPKERNEL = swap_lasx.S + +SAXPYKERNEL = axpy_lasx.S +DAXPYKERNEL = axpy_lasx.S +CAXPYKERNEL = caxpy_lasx.S +ZAXPYKERNEL = caxpy_lasx.S + +SAXPBYKERNEL = axpby_lasx.S +DAXPBYKERNEL = axpby_lasx.S +CAXPBYKERNEL = caxpby_lasx.S +ZAXPBYKERNEL = caxpby_lasx.S + +SSUMKERNEL = sum_lasx.S +DSUMKERNEL = sum_lasx.S + +SASUMKERNEL = asum_lasx.S +DASUMKERNEL = asum_lasx.S +CASUMKERNEL = casum_lasx.S +ZASUMKERNEL = casum_lasx.S + +SROTKERNEL = rot_lasx.S +DROTKERNEL = rot_lasx.S +CROTKERNEL = crot_lasx.S +ZROTKERNEL = crot_lasx.S + +SNRM2KERNEL = snrm2_lasx.S +DNRM2KERNEL = dnrm2_lasx.S +CNRM2KERNEL = cnrm2_lasx.S +ZNRM2KERNEL = znrm2_lasx.S + +CSWAPKERNEL = cswap_lasx.S +ZSWAPKERNEL = cswap_lasx.S + +CSUMKERNEL = csum_lasx.S +ZSUMKERNEL = csum_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S @@ -8,7 +95,65 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DGEMVNKERNEL = dgemv_n_8_lasx.S +DGEMVTKERNEL = dgemv_t_8_lasx.S + +SGEMMKERNEL = sgemm_kernel_16x8_lasx.S +SGEMMINCOPY = sgemm_ncopy_16_lasx.S +SGEMMITCOPY = sgemm_tcopy_16_lasx.S +SGEMMONCOPY = sgemm_ncopy_8_lasx.S +SGEMMOTCOPY = sgemm_tcopy_8_lasx.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +SGEMVNKERNEL = sgemv_n_8_lasx.S +SGEMVTKERNEL = sgemv_t_8_lasx.S + +CGEMMKERNEL = cgemm_kernel_16x4_lasx.S +CGEMMINCOPY = cgemm_ncopy_16_lasx.S +CGEMMITCOPY = cgemm_tcopy_16_lasx.S +CGEMMONCOPY = cgemm_ncopy_4_lasx.S +CGEMMOTCOPY = cgemm_tcopy_4_lasx.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMVNKERNEL = cgemv_n_8_lasx.S +CGEMVTKERNEL = cgemv_t_8_lasx.S + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZGEMMKERNEL = zgemm_kernel_8x4_lasx.S +ZGEMMINCOPY = zgemm_ncopy_8_lasx.S +ZGEMMITCOPY = zgemm_tcopy_8_lasx.S +ZGEMMONCOPY = zgemm_ncopy_4_lasx.S +ZGEMMOTCOPY = zgemm_tcopy_4_lasx.S +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMVNKERNEL = zgemv_n_4_lasx.S +ZGEMVTKERNEL = zgemv_t_4_lasx.S + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = dtrsm_kernel_LN_16x4_lasx.S +DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_lasx.S +DTRSMKERNEL_RN = dtrsm_kernel_RN_16x4_lasx.S +DTRSMKERNEL_RT = dtrsm_kernel_RT_16x4_lasx.S + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic index b772a6f82..213add9ee 100644 --- a/kernel/loongarch64/KERNEL.generic +++ b/kernel/loongarch64/KERNEL.generic @@ -132,12 +132,16 @@ CSWAPKERNEL = ../arm/zswap.c ZSWAPKERNEL = ../arm/zswap.c SGEMVNKERNEL = ../arm/gemv_n.c +ifndef DGEMVNKERNEL DGEMVNKERNEL = ../arm/gemv_n.c +endif CGEMVNKERNEL = ../arm/zgemv_n.c ZGEMVNKERNEL = ../arm/zgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c +ifndef DGEMVTKERNEL DGEMVTKERNEL = ../arm/gemv_t.c +endif CGEMVTKERNEL = ../arm/zgemv_t.c ZGEMVTKERNEL = ../arm/zgemv_t.c diff --git a/kernel/loongarch64/amax_lasx.S b/kernel/loongarch64/amax_lasx.S new file mode 100644 index 000000000..e964d4ddb --- /dev/null +++ b/kernel/loongarch64/amax_lasx.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + XVFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM0, VM1 + XVFMAXA VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAXA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMAXA VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAXA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + xvfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAXA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMAXA VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAXA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amax_lsx.S b/kernel/loongarch64/amax_lsx.S new file mode 100644 index 000000000..fb3b77a0e --- /dev/null +++ b/kernel/loongarch64/amax_lsx.S @@ -0,0 +1,231 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + VFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM0, VM1 + VFMAXA VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMAXA VM1, VX0, VX1 + VFMAXA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAXA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAXA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM1, VM1, VM2 + vfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmaxa.s VM1, VX0, VX1 + vfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAXA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAXA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amin_lasx.S b/kernel/loongarch64/amin_lasx.S new file mode 100644 index 000000000..c91a33006 --- /dev/null +++ b/kernel/loongarch64/amin_lasx.S @@ -0,0 +1,231 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM0, VM1 + XVFMINA VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMINA VM1, VX0, VX1 + XVFMINA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMINA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMINA VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMINA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + xvfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMINA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMINA VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMINA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amin_lsx.S b/kernel/loongarch64/amin_lsx.S new file mode 100644 index 000000000..47701b6e4 --- /dev/null +++ b/kernel/loongarch64/amin_lsx.S @@ -0,0 +1,230 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM0, VM1 + VFMINA VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMINA VM1, VX0, VX1 + VFMINA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMINA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMINA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM1, VM1, VM2 + vfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmaxa.s VM1, VX0, VX1 + vfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMINA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMINA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/asum_lasx.S b/kernel/loongarch64/asum_lasx.S new file mode 100644 index 000000000..9a2c031f3 --- /dev/null +++ b/kernel/loongarch64/asum_lasx.S @@ -0,0 +1,257 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define VT0 $xr23 +#define VT1 $xr22 +#define res1 $xr16 +#define res2 $xr17 +#define res0 $xr18 +#define neg1 $xr19 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + xvreplgr2vr.d neg1, t1 + xvffint.d.l neg1, neg1 +#else + li.w t1, -1 + xvreplgr2vr.w neg1, t1 + xvffint.s.w neg1, neg1 +#endif + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvfmul.s VX2, neg1, VX0 + xvfcmp.clt.s VT0, VX0, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvfadd.s res1, VX0, res1 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VX2, neg1, VX0 + xvfcmp.clt.s VT0, VX0, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvfadd.s res1, VX0, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/asum_lsx.S b/kernel/loongarch64/asum_lsx.S new file mode 100644 index 000000000..512b01404 --- /dev/null +++ b/kernel/loongarch64/asum_lsx.S @@ -0,0 +1,258 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define VT0 $vr23 +#define VT1 $vr22 +#define res1 $vr16 +#define res2 $vr17 +#define res0 $vr18 +#define neg1 $vr19 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + vreplgr2vr.d neg1, t1 + vffint.d.l neg1, neg1 +#else + li.w t1, -1 + vreplgr2vr.w neg1, t1 + vffint.s.w neg1, neg1 +#endif + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/axpby_lasx.S b/kernel/loongarch64/axpby_lasx.S new file mode 100644 index 000000000..7a246ca5c --- /dev/null +++ b/kernel/loongarch64/axpby_lasx.S @@ -0,0 +1,1055 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r5 +#define INCX $r6 +#define BETA $f1 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXA $xr23 +#define VXB $xr9 +#define VXZ $xr19 + + PROLOGUE + + bge $r0, N, .L999 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + MTG t1, ALPHA + MTG t2, BETA + MTG t3, a1 +#ifdef DOUBLE + xvreplgr2vr.d VXA, t1 + xvreplgr2vr.d VXB, t2 + xvreplgr2vr.d VXZ, t3 +#else + xvreplgr2vr.w VXA, t1 + xvreplgr2vr.w VXB, t2 + xvreplgr2vr.w VXZ, t3 +#endif + // If incx == 0 || incy == 0, do one by one + and TEMP, INCX, INCY + or I, N, N + beqz TEMP, .L998 + + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 + b .L111 // ALPHA!=0 BETA!=0 + .align 3 + +.L110: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L114 // ALPHA==0 BETA==0 + b .L113 // ALPHA==0 BETA!=0 + .align 3 + +.L111: // ALPHA!=0 BETA!=0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmul.d VX0, VX0, VXA + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX2, VX2, VXB, VX0 + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvfmadd.s VX2, VX2, VXB, VX0 + xvst VX2, Y, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // ALPHA!=0 BETA==0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvfmul.d VX0, VX0, VXA + xvfmul.d VX1, VX1, VXA + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE +#else + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // ALPHA==0 BETA!=0 + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX3, Y, 4 * SIZE + xvfmul.d VX2, VX2, VXB + xvfmul.d VX3, VX3, VXB + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE +#else + xvfmul.s VX2, VX2, VXB + xvst VX2, Y, 0 * SIZE +#endif + addi.d I, I, -1 + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // ALPHA==0 BETA==0 + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L120 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 + b .L121 // ALPHA!=0 BETA!=0 + .align 3 + +.L120: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L124 // ALPHA==0 BETA==0 + b .L123 // ALPHA==0 BETA!=0 + .align 3 + +.L121: // ALPHA!=0 BETA!=0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, VX0, VXA + xvld VX1, X, 4 * SIZE + xvfmadd.d VX2, VX2, VXB, VX0 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // ALPHA!=0 BETA==0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvfmul.d VX0, VX0, VXA + xvfmul.d VX1, VX1, VXA + xvstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 +#else + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // ALPHA==0 BETA!=0 +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX2, VX2, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX3, VX3, VXB + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // ALPHA==0 BETA==0 +#ifdef DOUBLE + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L210 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 + b .L211 // ALPHA!=0 BETA!=0 + .align 3 + +.L210: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L214 // ALPHA==0 BETA==0 + b .L213 // ALPHA==0 BETA!=0 + .align 3 + +.L211: // ALPHA!=0 BETA!=0 + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VX0, VXA, VX0 + xvfmadd.d VX2, VX2, VXB, VX0 + xvld VX3, Y, 4 * SIZE + xvst VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvst VX3, Y, 4 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // ALPHA!=0 BETA==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VX0, VXA, VX0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvst VX0, Y, 0 * SIZE + xvfmul.d VX1, VX1, VXA + addi.d I, I, -1 + xvst VX1, Y, 4 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // ALPHA==0 BETA!=0 + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX3, Y, 4 * SIZE + xvfmul.d VX2, VX2, VXB + xvfmul.d VX3, VX3, VXB + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE +#else + xvfmul.s VX2, VX2, VXB + xvst VX2, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // ALPHA==0 BETA==0 + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 + b .L221 // ALPHA!=0 BETA!=0 + .align 3 + +.L220: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L224 // ALPHA==0 BETA==0 + b .L223 // ALPHA==0 BETA!=0 + .align 3 + +.L221: // ALPHA!=0 BETA!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, VX0, VXA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvfmadd.d VX2, VX2, VXB, VX0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // ALPHA!=0 BETA==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VX0, VX0, VXA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX1, VX1, VXA + addi.d I, I, -1 + xvstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 7 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // ALPHA==0 BETA!=0 +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX2, VX2, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX3, VX3, VXB + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // ALPHA==0 BETA==0 +#ifdef DOUBLE + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE + addi.d I, I, -1 + MUL $f12, $f12, ALPHA + MADD $f13, $f13, BETA, $f12 + ST $f13, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/axpby_lsx.S b/kernel/loongarch64/axpby_lsx.S new file mode 100644 index 000000000..e50d4cdcc --- /dev/null +++ b/kernel/loongarch64/axpby_lsx.S @@ -0,0 +1,1153 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r5 +#define INCX $r6 +#define BETA $f1 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXA $vr23 +#define VXB $vr9 +#define VXZ $vr19 + + PROLOGUE + + bge $r0, N, .L999 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + MTG t1, ALPHA + MTG t2, BETA + MTG t3, a1 +#ifdef DOUBLE + vreplgr2vr.d VXA, t1 + vreplgr2vr.d VXB, t2 + vreplgr2vr.d VXZ, t3 +#else + vreplgr2vr.w VXA, t1 + vreplgr2vr.w VXB, t2 + vreplgr2vr.w VXZ, t3 +#endif + // If incx == 0 || incy == 0, do one by one + and TEMP, INCX, INCY + or I, N, N + beqz TEMP, .L998 + + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 + b .L111 // ALPHA!=0 BETA!=0 + .align 3 + +.L110: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L114 // ALPHA==0 BETA==0 + b .L113 // ALPHA==0 BETA!=0 + .align 3 + +.L111: // ALPHA!=0 BETA!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vfmadd.d VX2, VX2, VXB, VX0 + vfmadd.d VX3, VX3, VXB, VX1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vfmadd.d VX2, VX2, VXB, VX0 + vfmadd.d VX3, VX3, VXB, VX1 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vfmadd.s VX2, VX2, VXB, VX0 + vfmadd.s VX3, VX3, VXB, VX1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // ALPHA!=0 BETA==0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfmul.d VX2, VX2, VXA + vfmul.d VX3, VX3, VXA + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // ALPHA==0 BETA!=0 +#ifdef DOUBLE + vld VX0, Y, 0 * SIZE + vld VX1, Y, 2 * SIZE + vfmul.d VX0, VX0, VXB + vfmul.d VX1, VX1, VXB + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VX2, VX2, VXB + vfmul.d VX3, VX3, VXB + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // ALPHA==0 BETA==0 + vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + vst VXZ, Y, 2 * SIZE + vst VXZ, Y, 4 * SIZE + vst VXZ, Y, 6 * SIZE +#else + vst VXZ, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L120 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 + b .L121 // ALPHA!=0 BETA!=0 + .align 3 + +.L120: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L124 // ALPHA==0 BETA==0 + b .L123 // ALPHA==0 BETA!=0 + .align 3 + +.L121: // ALPHA!=0 BETA!=0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vfmul.d VX0, VX0, VXA + vld VX1, X, 2 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + vld VX0, X, 4 * SIZE + vfmadd.d VX3, VX3, VXB, VX1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX0, VX0, VXA + vld VX1, X, 6 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + vld VX1, X, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // ALPHA!=0 BETA==0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 +#else + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // ALPHA==0 BETA!=0 +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // ALPHA==0 BETA==0 +#ifdef DOUBLE + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L210 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 + b .L211 // ALPHA!=0 BETA!=0 + .align 3 + +.L210: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L214 // ALPHA==0 BETA==0 + b .L213 // ALPHA==0 BETA!=0 + .align 3 + +.L211: // ALPHA!=0 BETA!=0 + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VX0, VXA, VX0 + vld VX3, Y, 2 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX2, Y, 0 * SIZE + vfmul.d VX1, VXA, VX1 + vld VX2, Y, 4 * SIZE + vfmadd.d VX3, VX3, VXB, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vst VX3, Y, 2 * SIZE + vfmul.d VX0, VX0, VXA + vld VX3, Y, 6 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX2, Y, 4 * SIZE + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vst VX3, Y, 6 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX2, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // ALPHA!=0 BETA==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VX0, VXA, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 0 * SIZE + vfmul.d VX1, VXA, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vst VX1, Y, 2 * SIZE + vfmul.d VX0, VX0, VXA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 4 * SIZE + vfmul.d VX1, VX1, VXA + addi.d I, I, -1 + vst VX1, Y, 6 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX0, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vst VX1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // ALPHA==0 BETA!=0 + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX3, Y, 2 * SIZE + vfmul.d VX2, VX2, VXB + vfmul.d VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VX2, VX2, VXB + vfmul.d VX3, VX3, VXB + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // ALPHA==0 BETA==0 + vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + vst VXZ, Y, 2 * SIZE + vst VXZ, Y, 4 * SIZE + vst VXZ, Y, 6 * SIZE +#else + vst VXZ, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 + b .L221 // ALPHA!=0 BETA!=0 + .align 3 + +.L220: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L224 // ALPHA==0 BETA==0 + b .L223 // ALPHA==0 BETA!=0 + .align 3 + +.L221: // ALPHA!=0 BETA!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VX0, VXA + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VX0, VXA + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vfmadd.s VX2, VX2, VXB, VX0 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vfmadd.s VX3, VX3, VXB, VX1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // ALPHA!=0 BETA==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VX0, VX0, VXA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX0, VX0, VXA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + addi.d I, I, -1 + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // ALPHA==0 BETA!=0 +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // ALPHA==0 BETA==0 +#ifdef DOUBLE + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE + addi.d I, I, -1 + MUL $f12, $f12, ALPHA + MADD $f13, $f13, BETA, $f12 + ST $f13, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/axpy_lasx.S b/kernel/loongarch64/axpy_lasx.S new file mode 100644 index 000000000..707fd09b5 --- /dev/null +++ b/kernel/loongarch64/axpy_lasx.S @@ -0,0 +1,529 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXA $xr23 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + movgr2fr.d a2, TEMP + FFINT a2, a2 + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L999 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + MTG t1, ALPHA +#ifdef DOUBLE + xvreplgr2vr.d VXA, t1 +#else + xvreplgr2vr.w VXA, t1 +#endif + + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L113 + CMPEQ $fcc0, ALPHA, a2 + bceqz $fcc0, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfadd.d VX2, VX0, VX2 + xvfadd.d VX3, VX1, VX3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfadd.s VX2, VX0, VX2 + xvst VX2, Y, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + b .L113 + .align 3 + +.L112: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmadd.d VX2, VX0, VXA, VX2 + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfmadd.s VX2, VX0, VXA, VX2 + xvst VX2, Y, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L112 + .align 3 + +.L113: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L114: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L114 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX2, VX0, VXA, VX2 + xvld VX1, X, 4 * SIZE + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmadd.d VX2, VX0, VXA, VX2 + xvld VX3, Y, 4 * SIZE + xvst VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX2, VX0, VXA, VX2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/axpy_lsx.S b/kernel/loongarch64/axpy_lsx.S new file mode 100644 index 000000000..0d74e2bce --- /dev/null +++ b/kernel/loongarch64/axpy_lsx.S @@ -0,0 +1,573 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXA $vr23 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + movgr2fr.d a2, TEMP + FFINT a2, a2 + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L999 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + MTG t1, ALPHA +#ifdef DOUBLE + vreplgr2vr.d VXA, t1 +#else + vreplgr2vr.w VXA, t1 +#endif + + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L113 + CMPEQ $fcc0, ALPHA, a2 + bceqz $fcc0, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfadd.d VX2, VX0, VX2 + vfadd.d VX3, VX1, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vfadd.d VX2, VX0, VX2 + vfadd.d VX3, VX1, VX3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfadd.s VX2, VX0, VX2 + vfadd.s VX3, VX1, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L113 + .align 3 + +.L112: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfmadd.d VX2, VX0, VXA, VX2 + vfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + vfmadd.d VX2, VX0, VXA, VX2 + vfmadd.d VX3, VX1, VXA, VX3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX0, VXA, VX2 + vfmadd.s VX3, VX1, VXA, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 +#endif + blt $r0, I, .L112 + .align 3 + +.L113: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L114: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L114 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + vld VX1, X, 2 * SIZE + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + vld VX0, X, 4 * SIZE + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + vld VX1, X, 6 * SIZE + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + vld VX1, X, 4 * SIZE + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE +#endif + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmadd.d VX2, VX0, VXA, VX2 + vld VX3, Y, 2 * SIZE + vst VX2, Y, 0 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmadd.d VX3, VX1, VXA, VX3 + vld VX2, Y, 4 * SIZE + vst VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmadd.d VX2, VX0, VXA, VX2 + vld VX3, Y, 6 * SIZE + vst VX2, Y, 4 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX3, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmadd.s VX2, VX0, VXA, VX2 + vld VX3, Y, 4 * SIZE + vst VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camax_lasx.S b/kernel/loongarch64/camax_lasx.S new file mode 100644 index 000000000..f9a4e9012 --- /dev/null +++ b/kernel/loongarch64/camax_lasx.S @@ -0,0 +1,212 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VT0 $xr13 +#define VT1 $xr14 +#define res0 $xr18 +#define neg1 $xr19 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 + + PROLOGUE + xvxor.v VM0, VM0, VM0 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + xvld VX0, X, 0 + xvld VX1, X, 32 +#ifdef DOUBLE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMAX VM0, VM0, VM1 +#ifdef DOUBLE + xvld VX0, X, 64 + xvld VX1, X, 96 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMAX VM0, VM0, VM1 +#endif + addi.d I, I, -1 + addi.d X, X, 16 * SIZE + blt $r0, I, .L10 + .align 3 + +.L11: +#ifdef DOUBLE + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + XVFMAX VM0, x1, x2 +#else + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + XVFMAX VM0, x1, x2 + XVFMAX VM1, x3, x4 + XVFMAX VM0, VM0, VM1 +#endif + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + addi.d I, I, -1 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + FMAX s1, s1, s2 + FMAX s3, s3, s4 + FMAX s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + add.d X, X, INCX + FMAX s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camax_lsx.S b/kernel/loongarch64/camax_lsx.S new file mode 100644 index 000000000..cf46cb016 --- /dev/null +++ b/kernel/loongarch64/camax_lsx.S @@ -0,0 +1,239 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VT0 $vr13 +#define VT1 $vr14 +#define res0 $vr18 +#define neg1 $vr19 +#define VX0 $vr20 +#define VX1 $vr21 +#define VM0 $vr22 +#define VM1 $vr23 + + PROLOGUE + vxor.v VM0, VM0, VM0 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + vld VX0, X, 0 + vld VX1, X, 16 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 32 + vld VX1, X, 48 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMAX VM1, x1, VM1 + VFMAX VM0, VM0, VM1 +#ifdef DOUBLE + vld VX0, X, 64 + vld VX1, X, 80 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 96 + vld VX1, X, 112 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMAX VM1, x1, VM1 + VFMAX VM0, VM0, VM1 +#endif + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L10 + .align 3 + +.L11: +#ifdef DOUBLE + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + VFMAX VM0, x1, x2 +#else + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VFMAX VM1, x1, x2 + VFMAX VM0, x3, x4 + VFMAX VM0, VM0, VM1 +#endif + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + addi.d I, I, -1 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + FMAX s1, s1, s2 + FMAX s3, s3, s4 + FMAX s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + add.d X, X, INCX + FMAX s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camin_lasx.S b/kernel/loongarch64/camin_lasx.S new file mode 100644 index 000000000..c1c4c98c8 --- /dev/null +++ b/kernel/loongarch64/camin_lasx.S @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define TEMP $r16 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define a0 $f20 +#define a1 $f21 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VT0 $xr13 +#define VT1 $xr14 +#define res0 $xr18 +#define neg1 $xr19 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 + + PROLOGUE + MTC s1, $r0 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 +#ifdef DOUBLE + xvreplve0.d VM0, VM0 +#else + xvreplve0.w VM0, VM0 +#endif + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + xvld VX0, X, 0 + xvld VX1, X, 32 +#ifdef DOUBLE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMIN VM0, VM0, VM1 +#ifdef DOUBLE + xvld VX0, X, 64 + xvld VX1, X, 96 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMIN VM0, VM0, VM1 +#endif + addi.d I, I, -1 + addi.d X, X, 16 * SIZE + blt $r0, I, .L10 + .align 3 + +.L11: +#ifdef DOUBLE + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + XVFMIN VM0, x1, x2 +#else + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + XVFMIN VM0, x1, x2 + XVFMIN VM1, x3, x4 + XVFMIN VM0, VM0, VM1 +#endif + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + addi.d I, I, -1 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + FMIN s1, s1, s2 + FMIN s3, s3, s4 + FMIN s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + add.d X, X, INCX + FMIN s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camin_lsx.S b/kernel/loongarch64/camin_lsx.S new file mode 100644 index 000000000..ff666ea8f --- /dev/null +++ b/kernel/loongarch64/camin_lsx.S @@ -0,0 +1,248 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VT0 $vr13 +#define VT1 $vr14 +#define res0 $vr18 +#define neg1 $vr19 +#define VX0 $vr20 +#define VX1 $vr21 +#define VM0 $vr22 +#define VM1 $vr23 + + PROLOGUE + MTC s1, $r0 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 +#ifdef DOUBLE + vreplvei.d VM0, VM0, 0 +#else + vreplvei.w VM0, VM0, 0 +#endif + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + vld VX0, X, 0 + vld VX1, X, 16 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 32 + vld VX1, X, 48 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMIN VM1, x1, VM1 + VFMIN VM0, VM0, VM1 +#ifdef DOUBLE + vld VX0, X, 64 + vld VX1, X, 80 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 96 + vld VX1, X, 112 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMIN VM1, x1, VM1 + VFMIN VM0, VM0, VM1 +#endif + addi.d I, I, -1 + addi.d X, X, 16 * SIZE + blt $r0, I, .L10 + .align 3 + +.L11: +#ifdef DOUBLE + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + VFMIN VM0, x1, x2 +#else + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VFMIN VM1, x1, x2 + VFMIN VM0, x3, x4 + VFMIN VM0, VM0, VM1 +#endif + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + addi.d I, I, -1 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + FMIN s1, s1, s2 + FMIN s3, s3, s4 + FMIN s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + add.d X, X, INCX + FMIN s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/casum_lasx.S b/kernel/loongarch64/casum_lasx.S new file mode 100644 index 000000000..caf0ff969 --- /dev/null +++ b/kernel/loongarch64/casum_lasx.S @@ -0,0 +1,329 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 +#define res3 $xr18 +#define res0 $xr19 +#define neg1 $xr20 +#define VT0 $xr21 +#define VT1 $xr22 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + xvreplgr2vr.d neg1, t1 + xvffint.d.l neg1, neg1 +#else + li.w t1, -1 + xvreplgr2vr.w neg1, t1 + xvffint.s.w neg1, neg1 +#endif + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvfmul.d VX0, neg1, VX2 + xvfmul.d VX1, neg1, VX3 + xvfcmp.clt.d VT0, VX2, res0 + xvfcmp.clt.d VT1, VX3, res0 + xvbitsel.v VX2, VX2, VX0, VT0 + xvbitsel.v VX3, VX3, VX1, VT1 + xvfadd.d res2, VX2, VX3 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvfmul.s VX2, neg1, VX0 + xvfmul.s VX3, neg1, VX1 + xvfcmp.clt.s VT0, VX0, res0 + xvfcmp.clt.s VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvfmul.s VX2, neg1, VX0 + xvfmul.s VX3, neg1, VX1 + xvfcmp.clt.s VT0, VX0, res0 + xvfcmp.clt.s VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/casum_lsx.S b/kernel/loongarch64/casum_lsx.S new file mode 100644 index 000000000..4822f2080 --- /dev/null +++ b/kernel/loongarch64/casum_lsx.S @@ -0,0 +1,358 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 +#define res0 $vr19 +#define neg1 $vr20 +#define VT0 $vr21 +#define VT1 $vr22 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + vreplgr2vr.d neg1, t1 + vffint.d.l neg1, neg1 +#else + li.w t1, -1 + vreplgr2vr.w neg1, t1 + vffint.s.w neg1, neg1 +#endif + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfmul.d VX0, neg1, VX2 + vfmul.d VX1, neg1, VX3 + vfcmp.clt.d VT0, VX2, res0 + vfcmp.clt.d VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vfmul.d VX0, neg1, VX2 + vfmul.d VX1, neg1, VX3 + vfcmp.clt.d VT0, VX2, res0 + vfcmp.clt.d VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + addi.d I, I, -1 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vld VX0, X, 8 * SIZE + vld VX1, X, 12 * SIZE + addi.d I, I, -1 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res3, VX1, VX0 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d X, X, 16 * SIZE + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vfmul.s VX0, neg1, VX2 + vfmul.s VX1, neg1, VX3 + vfcmp.clt.s VT0, VX2, res0 + vfcmp.clt.s VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/caxpby_lasx.S b/kernel/loongarch64/caxpby_lasx.S new file mode 100644 index 000000000..c5802092e --- /dev/null +++ b/kernel/loongarch64/caxpby_lasx.S @@ -0,0 +1,1046 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r5 +#define INCX $r6 +#define BETAR $f2 +#define BETAI $f3 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXAR $xr23 +#define VXAI $xr19 +#define VXBR $xr14 +#define VXBI $xr13 +#define VXZ $xr12 +#define x1 $xr18 +#define x2 $xr17 +#define x3 $xr16 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, ALPHAR + MTG t2, ALPHAI + MTG t3, BETAR + MTG t4, BETAI +#ifdef DOUBLE + xvreplgr2vr.d VXAR, t1 + xvreplgr2vr.d VXAI, t2 + xvreplgr2vr.d VXBR, t3 + xvreplgr2vr.d VXBI, t4 +#else + xvreplgr2vr.w VXAR, t1 + xvreplgr2vr.w VXAI, t2 + xvreplgr2vr.w VXBR, t3 + xvreplgr2vr.w VXBI, t4 +#endif + xvxor.v VXZ, VXZ, VXZ + // If incx == 0 || incy == 0, do one by one + and TEMP, INCX, INCY + or I, N, N + beqz TEMP, .L998 + + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT +#ifdef DOUBLE + srai.d I, N, 2 +#else + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, BETAR, a1 + CMPEQ $fcc1, BETAI, a1 + CMPEQ $fcc2, ALPHAR, a1 + CMPEQ $fcc3, ALPHAI, a1 + bceqz $fcc0, .L13 + bceqz $fcc1, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc2, .L114 + bceqz $fcc3, .L114 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L113 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + +.L14: + bceqz $fcc2, .L112 + bceqz $fcc3, .L112 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L111 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L111: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvst VXZ, Y, 8 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif + XVFMUL x3, VXAI, x2 + XVFMUL x4, VXAI, x1 + XVMSUB x3, VXAR, x1, x3 + XVFMADD x4, VXAR, x2, x4 +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + xvld VX0, Y, 0 * SIZE + xvld VX1, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvld VX0, Y, 0 * SIZE + xvld VX1, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif + XVFMUL x3, VXBI, x2 + XVFMUL x4, VXBI, x1 + XVMSUB x3, VXBR, x1, x3 + XVFMADD x4, VXBR, x2, x4 +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 +#endif + XVFMUL VX0, VXAI, x2 + XVFMUL VX1, VXAI, x1 + XVFMUL VX2, VXBI, x4 + XVFMUL VX3, VXBI, x3 + XVMSUB VX0, VXAR, x1, VX0 + XVFMADD VX1, VXAR, x2, VX1 + XVMSUB VX2, VXBR, x3, VX2 + XVFMADD VX3, VXBR, x4, VX3 + XVFADD x3, VX0, VX2 + XVFADD x4, VX1, VX3 +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 2 + xvinsgr2vr.d x4, t4, 2 + + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 1 + xvinsgr2vr.d x4, t2, 1 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX1, VXAI, x1 + xvfmul.d VX2, VXBI, x4 + xvfmul.d VX3, VXBI, x3 + xvfmsub.d VX0, VXAR, x1, VX0 + xvfmadd.d VX1, VXAR, x2, VX1 + xvfmsub.d VX2, VXBR, x3, VX2 + xvfmadd.d VX3, VXBR, x4, VX3 + xvfadd.d x3, VX0, VX2 + xvfadd.d x4, VX1, VX3 + addi.d I, I, -1 + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 +#else + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + xvld VX1, X, 8 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + add.d Y, Y, INCY + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + XVFMUL VX0, VXAI, x2 + XVFMUL VX1, VXAI, x1 + XVFMUL VX2, VXBI, x4 + XVFMUL VX3, VXBI, x3 + XVMSUB VX0, VXAR, x1, VX0 + XVFMADD VX1, VXAR, x2, VX1 + XVMSUB VX2, VXBR, x3, VX2 + XVFMADD VX3, VXBR, x4, VX3 + XVFADD x3, VX0, VX2 + XVFADD x4, VX1, VX3 + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + addi.d X, X, 16 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 +#endif + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 2 + xvinsgr2vr.d x2, t4, 2 + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 1 + xvinsgr2vr.d x2, t2, 1 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX1, VXAI, x1 + xvfmul.d VX2, VXBI, x4 + xvfmul.d VX3, VXBI, x3 + xvfmsub.d VX0, VXAR, x1, VX0 + xvfmadd.d VX1, VXAR, x2, VX1 + xvfmsub.d VX2, VXBR, x3, VX2 + xvfmadd.d VX3, VXBR, x4, VX3 + xvfadd.d x3, VX0, VX2 + xvfadd.d x4, VX1, VX3 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 +#else + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + xvld VX3, Y, 8 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + XVFMUL VX0, VXAI, x2 + XVFMUL VX1, VXAI, x1 + XVFMUL VX2, VXBI, x4 + XVFMUL VX3, VXBI, x3 + XVMSUB VX0, VXAR, x1, VX0 + XVFMADD VX1, VXAR, x2, VX1 + XVMSUB VX2, VXBR, x3, VX2 + XVFMADD VX3, VXBR, x4, VX3 + XVFADD x3, VX0, VX2 + XVFADD x4, VX1, VX3 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 +#endif + +.L22: + bge $r0, I, .L997 + move YY, Y + CMPEQ $fcc0, BETAR, a1 + CMPEQ $fcc1, BETAI, a1 + CMPEQ $fcc2, ALPHAR, a1 + CMPEQ $fcc3, ALPHAI, a1 + bceqz $fcc0, .L23 + bceqz $fcc1, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc2, .L224 + bceqz $fcc3, .L224 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L223 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L24: + bceqz $fcc2, .L222 + bceqz $fcc3, .L222 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L221 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L221: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + xvstelm.d VXZ, Y, 0, 0 + xvstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VXZ, Y, 0, 0 + xvstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VXZ, Y, 0, 0 + xvstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VXZ, Y, 0, 0 + xvstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 +#else + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 +#endif + +.L222: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + xvfmul.d x3, VXAI, x2 + xvfmul.d x4, VXAI, x1 + xvfmsub.d x3, VXAR, x1, x3 + xvfmadd.d x4, VXAR, x2, x4 + addi.d I, I, -1 + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 +#else + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + XVFMUL x3, VXAI, x2 + XVFMUL x4, VXAI, x1 + XVMSUB x3, VXAR, x1, x3 + XVFMADD x4, VXAR, x2, x4 + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 +#endif + +.L223: +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d Y, Y, INCY + xvfmul.d x3, VXBI, x2 + xvfmul.d x4, VXBI, x1 + xvfmsub.d x3, VXBR, x1, x3 + xvfmadd.d x4, VXBR, x2, x4 + + addi.d I, I, -1 + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 +#else + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d Y, Y, INCY + + XVFMUL x3, VXBI, x2 + XVFMUL x4, VXBI, x1 + XVMSUB x3, VXBR, x1, x3 + XVFMADD x4, VXBR, x2, x4 + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 +#endif + +.L224: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX1, VXAI, x1 + xvfmul.d VX2, VXBI, x4 + xvfmul.d VX3, VXBI, x3 + xvfmsub.d VX0, VXAR, x1, VX0 + xvfmadd.d VX1, VXAR, x2, VX1 + xvfmsub.d VX2, VXBR, x3, VX2 + xvfmadd.d VX3, VXBR, x4, VX3 + xvfadd.d x3, VX0, VX2 + xvfadd.d x4, VX1, VX3 + addi.d I, I, -1 + + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L224 + b .L997 + .align 3 +#else + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + + XVFMUL VX0, VXAI, x2 + XVFMUL VX1, VXAI, x1 + XVFMUL VX2, VXBI, x4 + XVFMUL VX3, VXBI, x3 + XVMSUB VX0, VXAR, x1, VX0 + XVFMADD VX1, VXAR, x2, VX1 + XVMSUB VX2, VXBR, x3, VX2 + XVFMADD VX3, VXBR, x4, VX3 + XVFADD x3, VX0, VX2 + XVFADD x4, VX1, VX3 + addi.d I, I, -1 + + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + blt $r0, I, .L224 + b .L997 + .align 3 +#endif + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + addi.d I, I, -1 + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MUL s3, BETAI, a4 + MUL s4, BETAI, a3 + MSUB s1, ALPHAR, a1, s1 + MADD s2, a2, ALPHAR, s2 + MSUB s3, BETAR, a3, s3 + MADD s4, a4, BETAR, s4 + ADD s3, s3, s1 + ADD s4, s4, s2 + ST s3, Y, 0 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/caxpby_lsx.S b/kernel/loongarch64/caxpby_lsx.S new file mode 100644 index 000000000..247ae428e --- /dev/null +++ b/kernel/loongarch64/caxpby_lsx.S @@ -0,0 +1,1029 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r5 +#define INCX $r6 +#define BETAR $f2 +#define BETAI $f3 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXAR $vr23 +#define VXAI $vr19 +#define VXBR $vr14 +#define VXBI $vr13 +#define VXZ $vr12 +#define x1 $vr18 +#define x2 $vr17 +#define x3 $vr16 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + movgr2fr.d a1, $r0 +#ifdef DOUBLE + ffint.d.l a1, a1 +#else + ffint.s.l a1, a1 +#endif + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT +#ifdef DOUBLE + movfr2gr.d t1, ALPHAR + vreplgr2vr.d VXAR, t1 + movfr2gr.d t2, ALPHAI + vreplgr2vr.d VXAI, t2 + movfr2gr.d t3, BETAR + vreplgr2vr.d VXBR, t3 + movfr2gr.d t4, BETAI + vreplgr2vr.d VXBI, t4 +#else + movfr2gr.s t1, ALPHAR + vreplgr2vr.w VXAR, t1 + movfr2gr.s t2, ALPHAI + vreplgr2vr.w VXAI, t2 + movfr2gr.s t3, BETAR + vreplgr2vr.w VXBR, t3 + movfr2gr.s t4, BETAI + vreplgr2vr.w VXBI, t4 +#endif + vxor.v VXZ, VXZ, VXZ + // If incx == 0 || incy == 0, do one by one + and TEMP, INCX, INCY + or I, N, N + beqz TEMP, .L998 + + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 +#ifdef DOUBLE + fcmp.ceq.d $fcc0, BETAR, a1 + fcmp.ceq.d $fcc1, BETAI, a1 + fcmp.ceq.d $fcc2, ALPHAR, a1 + fcmp.ceq.d $fcc3, ALPHAI, a1 +#else + fcmp.ceq.s $fcc0, BETAR, a1 + fcmp.ceq.s $fcc1, BETAI, a1 + fcmp.ceq.s $fcc2, ALPHAR, a1 + fcmp.ceq.s $fcc3, ALPHAI, a1 +#endif + bceqz $fcc0, .L13 + bceqz $fcc1, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc2, .L114 + bceqz $fcc3, .L114 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L113 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + +.L14: + bceqz $fcc2, .L112 + bceqz $fcc3, .L112 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L111 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L111: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 2 * SIZE + vst VXZ, Y, 4 * SIZE + vst VXZ, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 +#else + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 +#endif + +.L112: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfmul.d x4, VXAI, x1 + vfmsub.d x3, VXAR, x1, x3 + vfmadd.d x4, VXAR, x2, x4 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfmul.d x4, VXAI, x1 + vfmsub.d x3, VXAR, x1, x3 + vfmadd.d x4, VXAR, x2, x4 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXAI, x2 + vfmul.s x4, VXAI, x1 + vfmsub.s x3, VXAR, x1, x3 + vfmadd.s x4, VXAR, x2, x4 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 +#endif + +.L113: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + vld VX0, Y, 0 * SIZE + vld VX1, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXBI, x2 + vfmul.d x4, VXBI, x1 + vfmsub.d x3, VXBR, x1, x3 + vfmadd.d x4, VXBR, x2, x4 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, Y, 4 * SIZE + vld VX1, Y, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXBI, x2 + vfmul.d x4, VXBI, x1 + vfmsub.d x3, VXBR, x1, x3 + vfmadd.d x4, VXBR, x2, x4 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 +#else + vld VX0, Y, 0 * SIZE + vld VX1, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXBI, x2 + vfmul.s x4, VXBI, x1 + vfmsub.s x3, VXBR, x1, x3 + vfmadd.s x4, VXBR, x2, x4 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 +#endif + +.L114: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, VXAI, x2 + vfmul.s VX1, VXAI, x1 + vfmul.s VX2, VXBI, x4 + vfmul.s VX3, VXBI, x3 + vfmsub.s VX0, VXAR, x1, VX0 + vfmadd.s VX1, VXAR, x2, VX1 + vfmsub.s VX2, VXBR, x3, VX2 + vfmadd.s VX3, VXBR, x4, VX3 + vfadd.s x3, VX0, VX2 + vfadd.s x4, VX1, VX3 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 +#endif + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s VX0, VXAI, x2 + vfmul.s VX1, VXAI, x1 + vfmul.s VX2, VXBI, x4 + vfmul.s VX3, VXBI, x3 + vfmsub.s VX0, VXAR, x1, VX0 + vfmadd.s VX1, VXAR, x2, VX1 + vfmsub.s VX2, VXBR, x3, VX2 + vfmadd.s VX3, VXBR, x4, VX3 + vfadd.s x3, VX0, VX2 + vfadd.s x4, VX1, VX3 + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 +#endif + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, VXAI, x2 + vfmul.s VX1, VXAI, x1 + vfmul.s VX2, VXBI, x4 + vfmul.s VX3, VXBI, x3 + vfmsub.s VX0, VXAR, x1, VX0 + vfmadd.s VX1, VXAR, x2, VX1 + vfmsub.s VX2, VXBR, x3, VX2 + vfmadd.s VX3, VXBR, x4, VX3 + vfadd.s x3, VX0, VX2 + vfadd.s x4, VX1, VX3 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + addi.d I, I, -1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 +#endif + +.L22: + bge $r0, I, .L997 + move YY, Y +#ifdef DOUBLE + fcmp.ceq.d $fcc0, BETAR, a1 + fcmp.ceq.d $fcc1, BETAI, a1 + fcmp.ceq.d $fcc2, ALPHAR, a1 + fcmp.ceq.d $fcc3, ALPHAI, a1 +#else + fcmp.ceq.s $fcc0, BETAR, a1 + fcmp.ceq.s $fcc1, BETAI, a1 + fcmp.ceq.s $fcc2, ALPHAR, a1 + fcmp.ceq.s $fcc3, ALPHAI, a1 +#endif + bceqz $fcc0, .L23 + bceqz $fcc1, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc2, .L224 + bceqz $fcc3, .L224 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L223 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L24: + bceqz $fcc2, .L222 + bceqz $fcc3, .L222 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L221 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L221: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + vstelm.d VXZ, Y, 0, 0 + vstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VXZ, Y, 0, 0 + vstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VXZ, Y, 0, 0 + vstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VXZ, Y, 0, 0 + vstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 +#else + vstelm.w VXZ, Y, 0, 0 + vstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VXZ, Y, 0, 0 + vstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VXZ, Y, 0, 0 + vstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VXZ, Y, 0, 0 + vstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 +#endif + +.L222: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d x3, VXAI, x2 + vfmul.d x4, VXAI, x1 + vfmsub.d x3, VXAR, x1, x3 + vfmadd.d x4, VXAR, x2, x4 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d x3, VXAI, x2 + vfmul.d x4, VXAI, x1 + vfmsub.d x3, VXAR, x1, x3 + vfmadd.d x4, VXAR, x2, x4 + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + vfmul.s x3, VXAI, x2 + vfmul.s x4, VXAI, x1 + vfmsub.s x3, VXAR, x1, x3 + vfmadd.s x4, VXAR, x2, x4 + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 +#endif + +.L223: +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d Y, Y, INCY + vfmul.d x3, VXBI, x2 + vfmul.d x4, VXBI, x1 + vfmsub.d x3, VXBR, x1, x3 + vfmadd.d x4, VXBR, x2, x4 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d Y, Y, INCY + vfmul.d x3, VXBI, x2 + vfmul.d x4, VXBI, x1 + vfmsub.d x3, VXBR, x1, x3 + vfmadd.d x4, VXBR, x2, x4 + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 +#else + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d Y, Y, INCY + vfmul.s x3, VXBI, x2 + vfmul.s x4, VXBI, x1 + vfmsub.s x3, VXBR, x1, x3 + vfmadd.s x4, VXBR, x2, x4 + + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 +#endif + +.L224: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VXAI, x2 + vfmul.s VX1, VXAI, x1 + vfmul.s VX2, VXBI, x4 + vfmul.s VX3, VXBI, x3 + vfmsub.s VX0, VXAR, x1, VX0 + vfmadd.s VX1, VXAR, x2, VX1 + vfmsub.s VX2, VXBR, x3, VX2 + vfmadd.s VX3, VXBR, x4, VX3 + vfadd.s x3, VX0, VX2 + vfadd.s x4, VX1, VX3 + addi.d I, I, -1 + + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L224 + b .L997 + .align 3 +#endif + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: +#ifdef DOUBLE + fld.d a1, X, 0 * SIZE + fld.d a2, X, 1 * SIZE + fld.d a3, Y, 0 * SIZE + fld.d a4, Y, 1 * SIZE + addi.d I, I, -1 + fmul.d s1, ALPHAI, a2 + fmul.d s2, ALPHAI, a1 + fmul.d s3, BETAI, a4 + fmul.d s4, BETAI, a3 + fmsub.d s1, ALPHAR, a1, s1 + fmadd.d s2, a2, ALPHAR, s2 + fmsub.d s3, BETAR, a3, s3 + fmadd.d s4, a4, BETAR, s4 + fadd.d s3, s3, s1 + fadd.d s4, s4, s2 + fst.d s3, Y, 0 * SIZE + fst.d s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 +#else + fld.s a1, X, 0 * SIZE + fld.s a2, X, 1 * SIZE + fld.s a3, Y, 0 * SIZE + fld.s a4, Y, 1 * SIZE + addi.d I, I, -1 + fmul.s s1, ALPHAI, a2 + fmul.s s2, ALPHAI, a1 + fmul.s s3, BETAI, a4 + fmul.s s4, BETAI, a3 + fmsub.s s1, ALPHAR, a1, s1 + fmadd.s s2, a2, ALPHAR, s2 + fmsub.s s3, BETAR, a3, s3 + fmadd.s s4, a4, BETAR, s4 + fadd.s s3, s3, s1 + fadd.s s4, s4, s2 + fst.s s3, Y, 0 * SIZE + fst.s s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 +#endif +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/caxpy_lasx.S b/kernel/loongarch64/caxpy_lasx.S new file mode 100644 index 000000000..2b970fe70 --- /dev/null +++ b/kernel/loongarch64/caxpy_lasx.S @@ -0,0 +1,707 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXAR $xr23 +#define VXAI $xr19 +#define x1 $xr18 +#define x2 $xr17 +#define x3 $xr16 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L10 + bcnez $fcc1, .L999 +.L10: + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, ALPHAR + MTG t2, ALPHAI +#ifdef DOUBLE + xvreplgr2vr.d VXAR, t1 + xvreplgr2vr.d VXAI, t2 + srai.d I, N, 2 +#else + xvreplgr2vr.w VXAR, t1 + xvreplgr2vr.w VXAI, t2 + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 2 + xvinsgr2vr.d x4, t4, 2 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 1 + xvinsgr2vr.d x4, t2, 1 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + xvld VX1, X, 8 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 +#else + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + addi.d X, X, 16 * SIZE +#endif + blt $r0, I, .L121 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 2 + xvinsgr2vr.d x2, t4, 2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 1 + xvinsgr2vr.d x2, t2, 1 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvld VX3, Y, 8 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif + addi.d I, I, -1 +#ifdef DOUBLE + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 +#else + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + addi.d I, I, -1 +#if !defined(CONJ) + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s3, ALPHAR, a1, s1 + MADD s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + ADD s4, s4, a4 +#else + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MADD s3, ALPHAR, a1, s1 + MSUB s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + SUB s4, a4, s4 +#endif + ST s3, Y, 0 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/caxpy_lsx.S b/kernel/loongarch64/caxpy_lsx.S new file mode 100644 index 000000000..85598d0b9 --- /dev/null +++ b/kernel/loongarch64/caxpy_lsx.S @@ -0,0 +1,679 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXAR $vr23 +#define VXAI $vr19 +#define x1 $vr18 +#define x2 $vr17 +#define x3 $vr16 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L10 + bcnez $fcc1, .L999 +.L10: + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, ALPHAR + MTG t2, ALPHAI +#ifdef DOUBLE + vreplgr2vr.d VXAR, t1 + vreplgr2vr.d VXAI, t2 +#else + vreplgr2vr.w VXAR, t1 + vreplgr2vr.w VXAI, t2 +#endif + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 +#else + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + addi.d I, I, -1 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + addi.d I, I, -1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 +#endif + add.d Y, Y, INCY +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 +#else + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + addi.d I, I, -1 +#if !defined(CONJ) + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s3, ALPHAR, a1, s1 + MADD s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + ADD s4, s4, a4 +#else + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MADD s3, ALPHAR, a1, s1 + MSUB s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + SUB s4, a4, s4 +#endif + ST s3, Y, 0 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/ccopy_lasx.S b/kernel/loongarch64/ccopy_lasx.S new file mode 100644 index 000000000..fbc5d96bc --- /dev/null +++ b/kernel/loongarch64/ccopy_lasx.S @@ -0,0 +1,386 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE + xvst VX2, Y, 8 * SIZE + xvst VX3, Y, 12 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 8 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 0 + xvstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0 * SIZE, 2 + xvstelm.d VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0 * SIZE, 0 + xvstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0 * SIZE, 2 + xvstelm.d VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.d VX2, Y, 0 * SIZE, 0 + xvstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX2, Y, 0 * SIZE, 2 + xvstelm.d VX2, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.d VX3, Y, 0 * SIZE, 0 + xvstelm.d VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX3, Y, 0 * SIZE, 2 + xvstelm.d VX3, Y, 1 * SIZE, 3 +#else + xvld VX1, X, 8 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 0 + xvstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 2 + xvstelm.w VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 4 + xvstelm.w VX0, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 6 + xvstelm.w VX0, Y, 1 * SIZE, 7 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 0 + xvstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 2 + xvstelm.w VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 4 + xvstelm.w VX1, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 6 + xvstelm.w VX1, Y, 1 * SIZE, 7 +#endif + add.d Y, Y, INCY + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE + xvst VX2, Y, 8 * SIZE + xvst VX3, Y, 12 * SIZE +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 8 * SIZE +#endif + addi.d I, I, -1 + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/ccopy_lsx.S b/kernel/loongarch64/ccopy_lsx.S new file mode 100644 index 000000000..4c4d880f1 --- /dev/null +++ b/kernel/loongarch64/ccopy_lsx.S @@ -0,0 +1,411 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11:// INCX==1 and INCY==1 + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + addi.d I, I, -1 + vst VX0, Y, 8 * SIZE + vst VX1, Y, 10 * SIZE + vst VX2, Y, 12 * SIZE + vst VX3, Y, 14 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + addi.d I, I, -1 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + vst VX2, Y, 8 * SIZE + vst VX3, Y, 12 * SIZE +#endif + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + vstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0 * SIZE, 0 + vstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX2, Y, 0 * SIZE, 0 + vstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX3, Y, 0 * SIZE, 0 + vstelm.d VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + vstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0 * SIZE, 0 + vstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX2, Y, 0 * SIZE, 0 + vstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX3, Y, 0 * SIZE, 0 + vstelm.d VX3, Y, 1 * SIZE, 1 +#else + vld VX1, X, 4 * SIZE + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 0 + vstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0 * SIZE, 2 + vstelm.w VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0 * SIZE, 0 + vstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0 * SIZE, 2 + vstelm.w VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX2, Y, 0 * SIZE, 0 + vstelm.w VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX2, Y, 0 * SIZE, 2 + vstelm.w VX2, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX3, Y, 0 * SIZE, 0 + vstelm.w VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX3, Y, 0 * SIZE, 2 + vstelm.w VX3, Y, 1 * SIZE, 3 +#endif + add.d Y, Y, INCY + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 4 * SIZE + vst VX1, Y, 6 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 8 * SIZE + vst VX1, Y, 10 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 12 * SIZE + vst VX1, Y, 14 * SIZE +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + vst VX2, Y, 8 * SIZE + vst VX3, Y, 12 * SIZE +#endif + addi.d Y, Y, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cdot_lasx.S b/kernel/loongarch64/cdot_lasx.S new file mode 100644 index 000000000..0583e56ea --- /dev/null +++ b/kernel/loongarch64/cdot_lasx.S @@ -0,0 +1,565 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r19 +#define TEMP $r10 +#define t1 $r11 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define res1 $xr16 +#define res2 $xr17 +#define res3 $xr18 +#define res4 $xr19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define x1 $xr20 +#define x2 $xr21 +#define x3 $xr22 +#define x4 $xr23 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res3, res3, res3 + xvxor.v res4, res4, res4 + bge $r0, N, .L999 + li.d TEMP, 2 * SIZE + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT +#ifdef DOUBLE + srai.d I, N, 2 +#else + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L996 + .align 3 + +.L12: + bge $r0, I, .L997 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 2 + xvinsgr2vr.d x4, t4, 2 + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 1 + xvinsgr2vr.d x4, t2, 1 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + addi.d X, X, 8 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 +#else + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + xvld VX1, X, 8 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + addi.d X, X, 16 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L21: + bge $r0, I, .L997 + .align 3 + +.L211: + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 2 + xvinsgr2vr.d x2, t4, 2 + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 1 + xvinsgr2vr.d x2, t2, 1 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + addi.d Y, Y, 8 * SIZE + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvld VX3, Y, 8 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + addi.d Y, Y, 8 * SIZE + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + b .L996 + .align 3 + +.L22: + bge $r0, I, .L997 + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L996: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 + xvpickve.d VX1, res2, 1 + xvpickve.d VX2, res2, 2 + xvpickve.d VX3, res2, 3 + xvfadd.d res2, VX1, res2 + xvfadd.d res2, VX2, res2 + xvfadd.d res2, VX3, res2 + xvpickve.d VX1, res3, 1 + xvpickve.d VX2, res3, 2 + xvpickve.d VX3, res3, 3 + xvfadd.d res3, VX1, res3 + xvfadd.d res3, VX2, res3 + xvfadd.d res3, VX3, res3 + xvpickve.d VX1, res4, 1 + xvpickve.d VX2, res4, 2 + xvpickve.d VX3, res4, 3 + xvfadd.d res4, VX1, res4 + xvfadd.d res4, VX2, res4 + xvfadd.d res4, VX3, res4 +#else + xvpickve.w VX0, res1, 1 + xvpickve.w VX1, res1, 2 + xvpickve.w VX2, res1, 3 + xvpickve.w VX3, res1, 4 + xvpickve.w x1, res1, 5 + xvpickve.w x2, res1, 6 + xvpickve.w x3, res1, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvfadd.s res1, x1, res1 + xvfadd.s res1, x2, res1 + xvfadd.s res1, x3, res1 + xvpickve.w VX0, res2, 1 + xvpickve.w VX1, res2, 2 + xvpickve.w VX2, res2, 3 + xvpickve.w VX3, res2, 4 + xvpickve.w x1, res2, 5 + xvpickve.w x2, res2, 6 + xvpickve.w x3, res2, 7 + xvfadd.s res2, VX0, res2 + xvfadd.s res2, VX1, res2 + xvfadd.s res2, VX2, res2 + xvfadd.s res2, VX3, res2 + xvfadd.s res2, x1, res2 + xvfadd.s res2, x2, res2 + xvfadd.s res2, x3, res2 + xvpickve.w VX0, res3, 1 + xvpickve.w VX1, res3, 2 + xvpickve.w VX2, res3, 3 + xvpickve.w VX3, res3, 4 + xvpickve.w x1, res3, 5 + xvpickve.w x2, res3, 6 + xvpickve.w x3, res3, 7 + xvfadd.s res3, VX0, res3 + xvfadd.s res3, VX1, res3 + xvfadd.s res3, VX2, res3 + xvfadd.s res3, VX3, res3 + xvfadd.s res3, x1, res3 + xvfadd.s res3, x2, res3 + xvfadd.s res3, x3, res3 + xvpickve.w VX0, res4, 1 + xvpickve.w VX1, res4, 2 + xvpickve.w VX2, res4, 3 + xvpickve.w VX3, res4, 4 + xvpickve.w x1, res4, 5 + xvpickve.w x2, res4, 6 + xvpickve.w x3, res4, 7 + xvfadd.s res4, VX0, res4 + xvfadd.s res4, VX1, res4 + xvfadd.s res4, VX2, res4 + xvfadd.s res4, VX3, res4 + xvfadd.s res4, x1, res4 + xvfadd.s res4, x2, res4 + xvfadd.s res4, x3, res4 +#endif + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MADD s1, a1, a3, s1 + MADD s2, a2, a3, s2 + MADD s3, a1, a4, s3 + MADD s4, a2, a4, s4 + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: +#ifndef CONJ + SUB $f0, s1, s4 + ADD $f1, s3, s2 +#else + ADD $f0, s1, s4 + SUB $f1, s3, s2 +#endif + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cdot_lsx.S b/kernel/loongarch64/cdot_lsx.S new file mode 100644 index 000000000..5feea12be --- /dev/null +++ b/kernel/loongarch64/cdot_lsx.S @@ -0,0 +1,397 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r19 +#define TEMP $r10 +#define t1 $r11 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 +#define res4 $vr19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define x1 $vr20 +#define x2 $vr21 +#define x3 $vr22 +#define x4 $vr23 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res3, res3, res3 + vxor.v res4, res4, res4 + bge $r0, N, .L999 + li.d TEMP, 2 * SIZE + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT +#ifdef DOUBLE + srai.d I, N, 1 +#else + srai.d I, N, 2 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L996 + .align 3 + +.L12: + bge $r0, I, .L997 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + addi.d X, X, 4 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + addi.d X, X, 8 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L21: + bge $r0, I, .L997 + .align 3 + +.L211: + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + addi.d Y, Y, 4 * SIZE + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + addi.d Y, Y, 8 * SIZE + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + b .L996 + .align 3 + +.L22: + bge $r0, I, .L997 + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L996: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + vreplvei.d VX1, res2, 1 + vfadd.d res2, VX1, res2 + vreplvei.d VX1, res3, 1 + vfadd.d res3, VX1, res3 + vreplvei.d VX1, res4, 1 + vfadd.d res4, VX1, res4 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 + vreplvei.w VX1, res2, 1 + vreplvei.w VX2, res2, 2 + vreplvei.w VX3, res2, 3 + vfadd.s res2, VX1, res2 + vfadd.s res2, VX2, res2 + vfadd.s res2, VX3, res2 + vreplvei.w VX1, res3, 1 + vreplvei.w VX2, res3, 2 + vreplvei.w VX3, res3, 3 + vfadd.s res3, VX1, res3 + vfadd.s res3, VX2, res3 + vfadd.s res3, VX3, res3 + vreplvei.w VX1, res4, 1 + vreplvei.w VX2, res4, 2 + vreplvei.w VX3, res4, 3 + vfadd.s res4, VX1, res4 + vfadd.s res4, VX2, res4 + vfadd.s res4, VX3, res4 +#endif + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 1 +#else + andi I, N, 3 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MADD s1, a1, a3, s1 + MADD s2, a2, a3, s2 + MADD s3, a1, a4, s3 + MADD s4, a2, a4, s4 + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: +#ifndef CONJ + SUB $f0, s1, s4 + ADD $f1, s3, s2 +#else + ADD $f0, s1, s4 + SUB $f1, s3, s2 +#endif + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cgemm_kernel_16x4_lasx.S b/kernel/loongarch64/cgemm_kernel_16x4_lasx.S new file mode 100644 index 000000000..249abe102 --- /dev/null +++ b/kernel/loongarch64/cgemm_kernel_16x4_lasx.S @@ -0,0 +1,3757 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 +#define T2 $r26 +#define T3 $r27 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 + +/* LASX vectors */ +#define U0 $xr30 +#define U1 $xr31 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define VALPHAR $xr28 +#define VALPHAI $xr29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVNMSUB +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VNMSUB +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVFMADD +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VFMADD +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVFMADD +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VFMADD +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVNMSUB +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VNMSUB +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA_R,$sp, 112 + ST ALPHA_I,$sp, 120 + + xvldrepl.w VALPHAR, $sp, 112 + xvldrepl.w VALPHAI, $sp, 120 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, 2 + + move J, $r0 + srai.d T0, N, 2 //bn/4 + beq J, T0, .L19 + +.L10: /* for(j=0; j 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x00 + xvld U2, S2, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U2, U1, 0x31 + + xvpermi.d U0, U0, 0xd8 + xvpermi.d U2, U2, 0xd8 + + xvst U0, TD, 0x00 + xvst U2, TD, 0x20 + + addi.d S1, S1, 0x20 // a_offset1 + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + xvld U0, TS, 0x00 + + xvst U0, TD, 0x00 + + addi.d TS, TS, 0x20 // a_offset + addi.d TD, TD, 0x20 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + + addi.d TS, TS, 0x08 // a_offset + addi.d TD, TD, 0x08 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_ncopy_2_lsx.S b/kernel/loongarch64/cgemm_ncopy_2_lsx.S new file mode 100644 index 000000000..1cf4d87dc --- /dev/null +++ b/kernel/loongarch64/cgemm_ncopy_2_lsx.S @@ -0,0 +1,202 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 +#define D8 $vr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + slli.d T0, TL, 0x01 + + srai.d I, N, 0x01 + beq I, ZERO, .L_N0 + +.L_J1: /* if (i > 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vand.v D0, U2, U2 + vand.v D1, U3, U3 + vand.v D2, U2, U2 + vand.v D3, U3, U3 + + vpermi.w D0, U0, 0x44 + vpermi.w D2, U0, 0xee + vpermi.w D1, U1, 0x44 + vpermi.w D3, U1, 0xee + + vst D0, TD, 0x00 + vst D2, TD, 0x10 + vst D1, TD, 0x20 + vst D3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset1 + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + vld U0, TS, 0x00 + vld U1, TS, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d TS, TS, 0x20 // a_offset + addi.d TD, TD, 0x20 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + + addi.d TS, TS, 0x08 // a_offset + addi.d TD, TD, 0x08 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_ncopy_4_lasx.S b/kernel/loongarch64/cgemm_ncopy_4_lasx.S new file mode 100644 index 000000000..24c2e6748 --- /dev/null +++ b/kernel/loongarch64/cgemm_ncopy_4_lasx.S @@ -0,0 +1,325 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define TD $r20 +#define TS $r11 +#define TL $r19 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 +#define D8 $xr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x02 + slli.d TL, TL, 0x01 + + srai.d J, N, 0x02 + beq J, ZERO, .L_N0 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq I, ZERO, .L_I3 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 //1 2 3 4 5 6 7 8 + xvld U1, S2, 0x00 //9 10 11 12 13 14 15 16 + xvld U2, S3, 0x00 //17 18 19 20 21 22 23 24 + xvld U3, S4, 0x00 //25 26 27 28 29 30 31 32 + + xvand.v D0, U0, U0 + xvand.v D1, U1, U1 + xvand.v D2, U2, U2 + xvand.v D3, U3, U3 + + xvshuf4i.d D0, U1, 0x88 //1 2 9 10 5 6 13 14 + xvshuf4i.d D2, U3, 0x88 //17 18 25 26 21 22 29 30 + xvshuf4i.d D1, U0, 0x77 //3 4 11 12 7 8 15 16 + xvshuf4i.d D3, U2, 0x77 //19 20 27 28 23 24 31 32 + + xvand.v U4, D0, D0 + xvand.v U5, D1, D1 + + xvpermi.q U4, D2, 0x02 //1 2 9 10 17 18 25 26 + xvpermi.q U5, D3, 0x02 //3 4 11 12 19 20 27 28 + xvpermi.q D2, D0, 0x31 //5 6 13 14 21 22 29 30 + xvpermi.q D3, D1, 0x31 //7 8 15 16 23 24 31 32 + + xvst U4, TD, 0x00 + xvst U5, TD, 0x20 + xvst D2, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I3: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_II20 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + vld $vr2, S3, 0x00 + vld $vr3, S4, 0x00 + + vand.v $vr8, $vr1, $vr1 + vand.v $vr9, $vr1, $vr1 + vand.v $vr10, $vr3, $vr3 + vand.v $vr11, $vr3, $vr3 + + vpermi.w $vr8, $vr0, 0x44 + vpermi.w $vr10, $vr2, 0x44 + vpermi.w $vr9, $vr0, 0xee + vpermi.w $vr11, $vr2, 0xee + + vst $vr8, TD, 0x00 + vst $vr10, TD, 0x10 + vst $vr9, TD, 0x20 + vst $vr11, TD, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d TD, TD, 0x40 + +.L_II20: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1c + + addi.d TD, TD, 0x20 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N0: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N20 + + move S1, TS + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq ZERO, I, .L_N10 + +.L_N11: /* if(i>0) */ + xvld U0, S1, 0x00 //1 2 3 4 5 6 7 8 + xvld U1, S2, 0x00 //9 10 11 12 13 14 15 16 + + xvand.v D0, U0, U0 + xvand.v D1, U1, U1 + + xvshuf4i.d D0, U1, 0x88 //1 2 9 10 5 6 13 14 + xvshuf4i.d D1, U0, 0x77 //3 4 11 12 7 8 15 16 + + xvand.v U4, D0, D0 + + xvpermi.q U4, D1, 0x02 //1 2 9 10 3 4 11 12 + xvpermi.q D1, D0, 0x31 //5 6 13 14 7 8 15 16 + + xvst U4, TD, 0x00 + xvst D1, TD, 0x20 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + vand.v $vr8, $vr1, $vr1 + + vpermi.w $vr8, $vr0, 0x44 + vpermi.w $vr1, $vr0, 0xee + + vst $vr8, TD, 0x00 + vst $vr1, TD, 0x10 + + addi.d S1, S1, 0x10 // a_offset + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d TD, TD, 0x10 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + xvld U0, S1, 0x00 + + xvst U0, TD, 0x00 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + vld $vr0, S1, 0x00 + + vst $vr0, TD, 0x00 + + addi.d S1, S1, 0x10 // aoffset1 + addi.d TD, TD, 0x10 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_ncopy_4_lsx.S b/kernel/loongarch64/cgemm_ncopy_4_lsx.S new file mode 100644 index 000000000..bfc712fb2 --- /dev/null +++ b/kernel/loongarch64/cgemm_ncopy_4_lsx.S @@ -0,0 +1,341 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define TD $r20 +#define TS $r11 +#define TL $r19 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 +#define D8 $vr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x02 + slli.d TL, TL, 0x01 + + srai.d J, N, 0x02 + beq J, ZERO, .L_N0 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq I, ZERO, .L_I3 + +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vand.v D0, U2, U2 + vand.v D1, U3, U3 + vand.v D2, U2, U2 + vand.v D3, U3, U3 + vand.v D4, U6, U6 + vand.v D5, U7, U7 + vand.v D6, U6, U6 + vand.v D7, U7, U7 + + vpermi.w D0, U0, 0x44 + vpermi.w D4, U4, 0x44 + vpermi.w D2, U0, 0xee + vpermi.w D6, U4, 0xee + vpermi.w D1, U1, 0x44 + vpermi.w D5, U5, 0x44 + vpermi.w D3, U1, 0xee + vpermi.w D7, U5, 0xee + + vst D0, TD, 0x00 + vst D4, TD, 0x10 + vst D2, TD, 0x20 + vst D6, TD, 0x30 + vst D1, TD, 0x40 + vst D5, TD, 0x50 + vst D3, TD, 0x60 + vst D7, TD, 0x70 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I3: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_II20 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vand.v D0, U1, U1 + vand.v D1, U1, U1 + vand.v D2, U3, U3 + vand.v D3, U3, U3 + + vpermi.w D0, U0, 0x44 + vpermi.w D2, U2, 0x44 + vpermi.w D1, U0, 0xee + vpermi.w D3, U2, 0xee + + vst D0, TD, 0x00 + vst D2, TD, 0x10 + vst D1, TD, 0x20 + vst D3, TD, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d TD, TD, 0x40 + +.L_II20: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1c + + addi.d TD, TD, 0x20 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N0: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N20 + + move S1, TS + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq ZERO, I, .L_N10 + +.L_N11: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vand.v D0, U2, U2 + vand.v D1, U3, U3 + vand.v D2, U2, U2 + vand.v D3, U3, U3 + + vpermi.w D0, U0, 0x44 + vpermi.w D2, U0, 0xee + vpermi.w D1, U1, 0x44 + vpermi.w D3, U1, 0xee + + vst D0, TD, 0x00 + vst D2, TD, 0x10 + vst D1, TD, 0x20 + vst D3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vand.v D0, U1, U1 + + vpermi.w D0, U0, 0x44 + vpermi.w U1, U0, 0xee + + vst D0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d S1, S1, 0x10 // a_offset + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d TD, TD, 0x10 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + vld U0, S1, 0x00 + + vst U0, TD, 0x00 + + addi.d S1, S1, 0x10 // aoffset1 + addi.d TD, TD, 0x10 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_ncopy_8_lsx.S b/kernel/loongarch64/cgemm_ncopy_8_lsx.S new file mode 100644 index 000000000..87a88e37d --- /dev/null +++ b/kernel/loongarch64/cgemm_ncopy_8_lsx.S @@ -0,0 +1,263 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 +#define D8 $vr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + slli.d T0, TL, 0x03 + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d S5, S4, TL + add.d S6, S5, TL + add.d S7, S6, TL + add.d S8, S7, TL + add.d TS, TS, T0 + + beq I, ZERO, .L_J11 + +.L_I1: /* if(i>0) i--*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1c + + fld.s F0, S5, 0x00 + fld.s F1, S5, 0x04 + fld.s F2, S6, 0x00 + fld.s F3, S6, 0x04 + fld.s F4, S7, 0x00 + fld.s F5, S7, 0x04 + fld.s F6, S8, 0x00 + fld.s F7, S8, 0x04 + + fst.s F0, TD, 0x20 + fst.s F1, TD, 0x24 + fst.s F2, TD, 0x28 + fst.s F3, TD, 0x2c + fst.s F4, TD, 0x30 + fst.s F5, TD, 0x34 + fst.s F6, TD, 0x38 + fst.s F7, TD, 0x3c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_J11: /* j--*/ + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4)*/ + andi I, N, 0x04 + beq I, ZERO, .L_N2 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d TS, S4, TL + + beq I, ZERO, .L_N2 + +.L_N11: /* if(i>0)*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N2: /* if(n&2)*/ + andi I, N, 0x02 + beq I, ZERO, .L_N3 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d TS, S2, TL + + beq I, ZERO, .L_N3 + +.L_N21: /* if(i>0)*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N3: /* if(n&2)*/ + andi I, N, 0x01 + beq I, ZERO, .L_N0 + + move S1, TS + move I, M + + beq I, ZERO, .L_N0 + +.L_N31: /* if(i>0)*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + + addi.d S1, S1, 0x08 + addi.d TD, TD, 0x08 + + addi.d I, I, -1 + blt ZERO, I, .L_N31 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_16_lasx.S b/kernel/loongarch64/cgemm_tcopy_16_lasx.S new file mode 100644 index 000000000..7d9eb94c8 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_16_lasx.S @@ -0,0 +1,741 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +#define F8 $f8 +#define F9 $f9 +#define F10 $f10 +#define F11 $f11 +#define F12 $f12 +#define F13 $f13 +#define F14 $f14 +#define F15 $f15 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 + + PROLOGUE + + addi.d $sp, $sp, -56 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + SDARG $r29, $sp, 48 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x04 + srai.d T1, N, 0x03 + slli.d T0, T0, 0x04 + slli.d T1, T1, 0x03 + mul.d P2, M, T0 + mul.d P3, M, T1 + slli.d P2, P2, 0x03 + slli.d P3, P3, 0x03 + add.d P2, DST, P2 + add.d P3, DST, P3 + + srai.d T0, N, 0x02 + srai.d T1, N, 0x01 + slli.d T0, T0, 0x02 + slli.d T1, T1, 0x01 + mul.d P4, M, T0 + mul.d P5, M, T1 + slli.d P4, P4, 0x03 + slli.d P5, P5, 0x03 + add.d P4, DST, P4 + add.d P5, DST, P5 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x03 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x07 + beq ZERO, J, .L_M7 + +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S0, S7, T0 + + move P1, P0 + addi.d P0, P0, 0x400 + + srai.d I, N, 0x04 + addi.d J, J, -1 + beq ZERO, I, .L_N15 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S3, 0x00 + xvld U1, S3, 0x20 + xvld U2, S3, 0x40 + xvld U3, S3, 0x60 + xvld U4, S4, 0x00 + xvld U5, S4, 0x20 + xvld U6, S4, 0x40 + xvld U7, S4, 0x60 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + xvst U4, P1, 0x180 + xvst U5, P1, 0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S5, 0x40 + xvld U3, S5, 0x60 + xvld U4, S6, 0x00 + xvld U5, S6, 0x20 + xvld U6, S6, 0x40 + xvld U7, S6, 0x60 + + xvst U0, P1, 0x200 + xvst U1, P1, 0x220 + xvst U2, P1, 0x240 + xvst U3, P1, 0x260 + xvst U4, P1, 0x280 + xvst U5, P1, 0x2A0 + xvst U6, P1, 0x2C0 + xvst U7, P1, 0x2E0 + + xvld U0, S7, 0x00 + xvld U1, S7, 0x20 + xvld U2, S7, 0x40 + xvld U3, S7, 0x60 + xvld U4, S8, 0x00 + xvld U5, S8, 0x20 + xvld U6, S8, 0x40 + xvld U7, S8, 0x60 + + xvst U0, P1, 0x300 + xvst U1, P1, 0x320 + xvst U2, P1, 0x340 + xvst U3, P1, 0x360 + xvst U4, P1, 0x380 + xvst U5, P1, 0x3A0 + xvst U6, P1, 0x3C0 + xvst U7, P1, 0x3E0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d S3, S3, 0x80 + addi.d S4, S4, 0x80 + addi.d S5, S5, 0x80 + addi.d S6, S6, 0x80 + addi.d S7, S7, 0x80 + addi.d S8, S8, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_I1 + +.L_N15: + andi I, N, 0x08 + beq ZERO, I, .L_N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + xvst U4, P2, 0x80 + xvst U5, P2, 0xA0 + xvst U6, P2, 0xC0 + xvst U7, P2, 0xE0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S6, 0x00 + xvld U3, S6, 0x20 + xvld U4, S7, 0x00 + xvld U5, S7, 0x20 + xvld U6, S8, 0x00 + xvld U7, S8, 0x20 + + xvst U0, P2, 0x100 + xvst U1, P2, 0x120 + xvst U2, P2, 0x140 + xvst U3, P2, 0x160 + xvst U4, P2, 0x180 + xvst U5, P2, 0x1A0 + xvst U6, P2, 0x1C0 + xvst U7, P2, 0x1E0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d P2, P2, 0x200 + +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + xvst U2, P3, 0x40 + xvst U3, P3, 0x60 + xvst U4, P3, 0x80 + xvst U5, P3, 0xA0 + xvst U6, P3, 0xC0 + xvst U7, P3, 0xE0 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S5, S5, 0x20 + addi.d S6, S6, 0x20 + addi.d S7, S7, 0x20 + addi.d S8, S8, 0x20 + addi.d P3, P3, 0x100 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + xvpermi.q U4, U5, 0x02 + xvpermi.q U6, U7, 0x02 + + xvst U0, P4, 0x00 + xvst U2, P4, 0x20 + xvst U4, P4, 0x40 + xvst U6, P4, 0x60 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d P4, P4, 0x80 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fld.s F8, S5, 0x00 + fld.s F9, S5, 0x04 + + fld.s F10, S6, 0x00 + fld.s F11, S6, 0x04 + + fld.s F12, S7, 0x00 + fld.s F13, S7, 0x04 + + fld.s F14, S8, 0x00 + fld.s F15, S8, 0x04 + + fst.s F0, P5, 0x00 + fst.s F1, P5, 0x04 + fst.s F2, P5, 0x08 + fst.s F3, P5, 0x0c + fst.s F4, P5, 0x10 + fst.s F5, P5, 0x14 + fst.s F6, P5, 0x18 + fst.s F7, P5, 0x1c + fst.s F8, P5, 0x20 + fst.s F9, P5, 0x24 + fst.s F10, P5, 0x28 + fst.s F11, P5, 0x2c + fst.s F12, P5, 0x30 + fst.s F13, P5, 0x34 + fst.s F14, P5, 0x38 + fst.s F15, P5, 0x3c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d P5, P5, 0x40 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 + + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x200 + + srai.d I, N, 0x04 + beq ZERO, I, .L_4N15 + +.L_4I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S3, 0x00 + xvld U1, S3, 0x20 + xvld U2, S3, 0x40 + xvld U3, S3, 0x60 + xvld U4, S4, 0x00 + xvld U5, S4, 0x20 + xvld U6, S4, 0x40 + xvld U7, S4, 0x60 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + xvst U4, P1, 0x180 + xvst U5, P1, 0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d S3, S3, 0x80 + addi.d S4, S4, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_4I1 + +.L_4N15: + andi I, N, 0x08 + beq ZERO, I, .L_4N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + xvst U4, P2, 0x80 + xvst U5, P2, 0xA0 + xvst U6, P2, 0xC0 + xvst U7, P2, 0xE0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d P2, P2, 0x100 + +.L_4N7: + andi I, N, 0x04 + beq ZERO, I, .L_4N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + xvst U2, P3, 0x40 + xvst U3, P3, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d P3, P3, 0x80 + +.L_4N3: + andi I, N, 0x02 + beq ZERO, I, .L_4N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + + xvst U0, P4, 0x00 + xvst U2, P4, 0x20 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P4, P4, 0x40 + +.L_4N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + fst.d F2, P5, 0x10 + fst.d F3, P5, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P5, P5, 0x20 + +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x100 + + srai.d I, N, 0x04 + beq ZERO, I, .L_2N15 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_2I1 + +.L_2N15: + andi I, N, 0x08 + beq ZERO, I, .L_2N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d P2, P2, 0x80 + +.L_2N7: + andi I, N, 0x04 + beq ZERO, I, .L_2N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d P3, P3, 0x40 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpermi.q U0, U1, 0x02 + + xvst U0, P4, 0x00 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P4, P4, 0x20 + +.L_2N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P5, P5, 0x10 + +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + add.d S2, S0, TL + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x04 + beq ZERO, I, .L_1N15 + +.L_1I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + addi.d S1, S1, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_1I1 + +.L_1N15: + andi I, N, 0x08 + beq ZERO, I, .L_1N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + + addi.d S1, S1, 0x40 + addi.d P2, P2, 0x40 + +.L_1N7: + andi I, N, 0x04 + beq ZERO, I, .L_1N3 + + xvld U0, S1, 0x00 + + xvst U0, P3, 0x00 + + addi.d S1, S1, 0x20 + addi.d P3, P3, 0x20 + +.L_1N3: + andi I, N, 0x02 + beq ZERO, I, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + + addi.d S1, S1, 0x10 + addi.d P4, P4, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P5, 0x00 + + addi.d S1, S1, 0x08 + addi.d P5, P5, 0x08 + +.L_M0: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + LDARG $r29, $sp, 48 + addi.d $sp, $sp, 56 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_2_lasx.S b/kernel/loongarch64/cgemm_tcopy_2_lasx.S new file mode 100644 index 000000000..e2245e412 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_2_lasx.S @@ -0,0 +1,218 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x20 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x00 + xvld U2, S2, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U2, U1, 0x31 + + xvst U0, S8, 0x00 + + slli.d T0, M, 0x04 + add.d S8, S8, T0 + + xvst U2, S8, 0x00 + + add.d S8, S8, T0 + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S8, 0x00 + vst $vr1, S8, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + fst.s F2, S9, 0x08 + fst.s F3, S9, 0x0c + + addi.d S9, S9, 0x10 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld $vr0, TS, 0x00 + vld $vr1, TS, 0x10 + + vst $vr0, TD, 0x00 + + slli.d T0, M, 0x04 + add.d TD, TD, T0 + + vst $vr1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld $vr0, TS, 0x00 + + vst $vr0, TD, 0x00 + + addi.d TS, TS, 0x10 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_2_lsx.S b/kernel/loongarch64/cgemm_tcopy_2_lsx.S new file mode 100644 index 000000000..15c0fde8f --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_2_lsx.S @@ -0,0 +1,218 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x20 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, S8, 0x00 + vst U2, S8, 0x10 + + slli.d T0, M, 0x04 + add.d S8, S8, T0 + + vst U1, S8, 0x00 + vst U3, S8, 0x10 + + add.d S8, S8, T0 + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + fst.s F2, S9, 0x08 + fst.s F3, S9, 0x0c + + addi.d S9, S9, 0x10 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld U0, TS, 0x00 + vld U1, TS, 0x10 + + vst U0, TD, 0x00 + + slli.d T0, M, 0x04 + add.d TD, TD, T0 + + vst U1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld U0, TS, 0x00 + + vst U0, TD, 0x00 + + addi.d TS, TS, 0x10 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_4_lasx.S b/kernel/loongarch64/cgemm_tcopy_4_lasx.S new file mode 100644 index 000000000..9ff8a35b8 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_4_lasx.S @@ -0,0 +1,306 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.w T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.w T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + xvst U2, S8, 0x40 + xvst U3, S8, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + vld $vr2, S3, 0x00 + vld $vr3, S4, 0x00 + + vst $vr0, S9, 0x00 + vst $vr1, S9, 0x10 + vst $vr2, S9, 0x20 + vst $vr3, S9, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S9, S9, 0x40 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + fst.s F2, S10, 0x08 + fst.s F3, S10, 0x0c + fst.s F4, S10, 0x10 + fst.s F5, S10, 0x14 + fst.s F6, S10, 0x18 + fst.s F7, S10, 0x1c + + addi.d S10, S10, 0x20 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S9, 0x00 + vst $vr1, S9, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S9, S9, 0x20 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + fst.s F2, S10, 0x08 + fst.s F3, S10, 0x0c + + addi.d S10, S10, 0x10 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + xvld U0, S1, 0x00 + + xvst U0, S8, 0x00 + + addi.d S1, S1, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + vld $vr0, S1, 0x00 + + vst $vr0, S9, 0x00 + + addi.d S1, S1, 0x10 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_4_lsx.S b/kernel/loongarch64/cgemm_tcopy_4_lsx.S new file mode 100644 index 000000000..6d63d62e7 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_4_lsx.S @@ -0,0 +1,324 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.w T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.w T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + vst U4, S8, 0x40 + vst U5, S8, 0x50 + vst U6, S8, 0x60 + vst U7, S8, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + vst U2, S9, 0x20 + vst U3, S9, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S9, S9, 0x40 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + fst.s F2, S10, 0x08 + fst.s F3, S10, 0x0c + fst.s F4, S10, 0x10 + fst.s F5, S10, 0x14 + fst.s F6, S10, 0x18 + fst.s F7, S10, 0x1c + + addi.d S10, S10, 0x20 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S9, S9, 0x20 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + fst.s F2, S10, 0x08 + fst.s F3, S10, 0x0c + + addi.d S10, S10, 0x10 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + + addi.d S1, S1, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + vld U0, S1, 0x00 + + vst U0, S9, 0x00 + + addi.d S1, S1, 0x10 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_8_lsx.S b/kernel/loongarch64/cgemm_tcopy_8_lsx.S new file mode 100644 index 000000000..2935bbc07 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_8_lsx.S @@ -0,0 +1,277 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x40 + + srai.d I, M, 0x01 + beq ZERO, I, .L_J1M1 + +.L_J1I1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + vst U4, TD, 0x40 + vst U5, TD, 0x50 + vst U6, TD, 0x60 + vst U7, TD, 0x70 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_J1I1 + +.L_J1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_J0 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + addi.d TD, TD, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4) */ + andi I, N, 0x04 + beq ZERO, I, .L_N2 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x20 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N1M1 + +.L_N1I1: /* if(i>0) i-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_N1I1 + +.L_N1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d TD, TD, 0x20 + +.L_N2: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N3 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x10 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N2M1 + +.L_N2I1: /* if(i>0) i-- */ + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + add.d S1, S1, T0 + add.d S2, S2, T0 + + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N2I1 + +.L_N2M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N3 + + vld U0, S1, 0x00 + + vst U0, TD, 0x00 + + addi.d TD, TD, 0x10 + +.L_N3: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + + srai.d I, M, 0x01 + beq ZERO, I, .L_N3M1 + +.L_N3I1: /* if(i>0) i-- */ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x10 + + addi.d I, I, -1 + blt ZERO, I, .L_N3I1 + +.L_N3M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemv_n_4_lsx.S b/kernel/loongarch64/cgemv_n_4_lsx.S new file mode 100644 index 000000000..cf8273797 --- /dev/null +++ b/kernel/loongarch64/cgemv_n_4_lsx.S @@ -0,0 +1,323 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define Y_ORG $r15 +#define OFFSET $r16 +#define K_LDA $r17 +#define M8 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 + +#define VALPHA $vr1 +#define X0 $vr2 +#define X1 $vr3 +#define X2 $vr4 +#define X3 $vr5 +#define X4 $vr6 +#define X5 $vr7 +#define X6 $vr8 +#define X7 $vr9 +#define Y0 $vr10 +#define Y1 $vr11 +#define A0 $vr12 +#define A1 $vr13 +#define A2 $vr14 +#define A3 $vr15 +#define A4 $vr16 +#define A5 $vr17 +#define A6 $vr18 +#define A7 $vr19 +#define A8 $vr20 +#define A9 $vr21 +#define A10 $vr22 +#define A11 $vr23 +#define A12 $vr24 +#define A13 $vr25 +#define A14 $vr26 +#define A15 $vr27 +#define TMP0 $vr28 +#define TMP1 $vr29 +#define TMP2 $vr30 + +#if !defined(CONJ) +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 0 +#else +#define GXCONJ 1 +#define GCONJ 0 +#endif +#else +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 1 +#else +#define GXCONJ 1 +#define GCONJ 1 +#endif +#endif + +.macro CLOAD_X_4 + GLDREPL v, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18 + GCOMPLEXMUL GXCONJ, \ + vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2 +.endm + +.macro CLOAD_X_4_GAP + vldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + vldrepl.d X1, T0, 0x00 + PTR_ADD T0, T0, INC_X + vldrepl.d X2, T0, 0x00 + PTR_ADD T0, T0, INC_X + vldrepl.d X3, T0, 0x00 + + GCOMPLEXMUL GXCONJ, \ + vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2 +.endm + +.macro CLOAD_X_1 + GLDREPL v, d, X0, X, 0x00 + GCOMPLEXMUL GXCONJ, \ + vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2 +.endm + +.macro CLOAD_Y_4 + GLD v, , Y0, Y, 0, Y1, Y, 0x10 +.endm + +.macro CLOAD_Y_4_GAP + fld.d $f10, Y, 0 + fldx.d $f13, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + fld.d $f11, T0, 0 + fldx.d $f17, T0, INC_Y + vpackev.d Y0, A1, Y0 + vpackev.d Y1, A5, Y1 +.endm + +.macro CLOAD_Y_1 + fld.d $f10, Y, 0 +.endm + +.macro CSTORE_Y_4 + GST v, , Y0, Y, 0, Y1, Y, 0x10 +.endm + +.macro CSTORE_Y_4_GAP + vstelm.d Y0, Y, 0, 0 + PTR_ADD T0, Y, INC_Y + vstelm.d Y0, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + vstelm.d Y1, T0, 0, 0 + PTR_ADD T0, T0, INC_Y + vstelm.d Y1, T0, 0, 1 +.endm + +.macro CSTORE_Y_1 + fst.d $f10, Y, 0 +.endm + +.macro CGEMV_N_4x4 + GLD_INC v, , 0x10, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + + GCOMPLEXMADD GXCONJ, GCONJ, \ + vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \ + Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \ + Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_N_1x4 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \ + Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \ + Y0, X3, A6, Y0, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_N_1x1 + fld.d $f12, PA0, 0 + PTR_ADDI PA0, PA0, 0x08 + GCOMPLEXMADD GXCONJ, GCONJ, \ + vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_N_LSX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req + PTR_SRLI J, N, 2 + beqz J, .L_\XW\()_N_3 + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L4: + CLOAD_\X_4 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 2 + beqz I, .L_\XW\()_M_3 +.align 5 +.L_\XW\()_M_L4: + CLOAD_\Y_4 + CGEMV_N_4x4 + CSTORE_\Y_4 + PTR_ADDI I, I, -1 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 + bnez I, .L_\XW\()_M_L4 +.L_\XW\()_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + CLOAD_\Y_1 + CGEMV_N_1x4 + CSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + PTR_ALSL X, INC_X, X, 2 + bnez J, .L_\XW\()_N_L4 +.L_\XW\()_N_3: + andi J, N, 3 + beqz J, .L_END +.L_\XW\()_N_L1: + CLOAD_\X_1 + xor K, K, K + move Y, Y_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + CLOAD_\Y_1 + CGEMV_N_1x1 + CSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + PTR_SUB K_LDA, LDA, M8 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD X, X, INC_X + bnez J, .L_\XW\()_N_L1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 7, 31 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + PTR_SUB J, INC_Y, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ + PTR_ALSL I, I, J, 1 + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + // Init VALPHA + vpackev.w $vr0, $vr1, $vr0 + vpackev.d VALPHA, $vr0, $vr0 + move Y_ORG, Y + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 // Obtain the offset address + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0_0 - .L_GAP_TABLE + .hword .L_GAP_0_1 - .L_GAP_TABLE + .hword .L_GAP_1_0 - .L_GAP_TABLE + .hword .L_GAP_1_1 - .L_GAP_TABLE +.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ + CGEMV_N_LSX GAP_0_0, X_4, X_1, Y_4, Y_1 +.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ + CGEMV_N_LSX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1 +.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ + CGEMV_N_LSX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1 +.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ + CGEMV_N_LSX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1 +.L_END: + pop_if_used 17 + 7, 31 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/cgemv_n_8_lasx.S b/kernel/loongarch64/cgemv_n_8_lasx.S new file mode 100644 index 000000000..ba38a9573 --- /dev/null +++ b/kernel/loongarch64/cgemv_n_8_lasx.S @@ -0,0 +1,383 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2024/02/20 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define Y_ORG $r15 +#define OFFSET $r16 +#define K_LDA $r17 +#define M8 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 + +#define VALPHA $xr1 +#define X0 $xr2 +#define X1 $xr3 +#define X2 $xr4 +#define X3 $xr5 +#define X4 $xr6 +#define X5 $xr7 +#define X6 $xr8 +#define X7 $xr9 +#define Y0 $xr10 +#define Y1 $xr11 +#define A0 $xr12 +#define A1 $xr13 +#define A2 $xr14 +#define A3 $xr15 +#define A4 $xr16 +#define A5 $xr17 +#define A6 $xr18 +#define A7 $xr19 +#define A8 $xr20 +#define A9 $xr21 +#define A10 $xr22 +#define A11 $xr23 +#define A12 $xr24 +#define A13 $xr25 +#define A14 $xr26 +#define A15 $xr27 +#define TMP0 $xr28 +#define TMP1 $xr29 +#define TMP2 $xr30 + +#if !defined(CONJ) +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 0 +#else +#define GXCONJ 1 +#define GCONJ 0 +#endif +#else +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 1 +#else +#define GXCONJ 1 +#define GCONJ 1 +#endif +#endif + +.macro CLOAD_X_8 + GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \ + X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38 + GCOMPLEXMUL GXCONJ, \ + xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2, \ + X4, VALPHA, X4, TMP0, TMP1, TMP2, \ + X5, VALPHA, X5, TMP0, TMP1, TMP2, \ + X6, VALPHA, X6, TMP0, TMP1, TMP2, \ + X7, VALPHA, X7, TMP0, TMP1, TMP2 +.endm + +.macro CLOAD_X_8_GAP + xvldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.d X1, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X2, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X3, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X4, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X5, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X6, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X7, T0, 0x00 + + GCOMPLEXMUL GXCONJ, \ + xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2, \ + X4, VALPHA, X4, TMP0, TMP1, TMP2, \ + X5, VALPHA, X5, TMP0, TMP1, TMP2, \ + X6, VALPHA, X6, TMP0, TMP1, TMP2, \ + X7, VALPHA, X7, TMP0, TMP1, TMP2 +.endm + +.macro CLOAD_Y_8 + GLD xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro CLOAD_Y_8_GAP + fld.d $f10, Y, 0 + fldx.d $f13, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + fld.d $f14, T0, 0 + fldx.d $f15, T0, INC_Y + PTR_ALSL T0, INC_Y, Y, 2 + fld.d $f11, T0, 0 + fldx.d $f17, T0, INC_Y + PTR_ADD T0, T0, INC_Y + PTR_ADD T0, T0, INC_Y + fld.d $f18, T0, 0 + fldx.d $f19, T0, INC_Y + GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3 +.endm + +.macro CSTORE_Y_8_GAP + xvstelm.d Y0, Y, 0, 0 + PTR_ADD T0, Y, INC_Y + xvstelm.d Y0, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 3 + + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 0 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 3 +.endm + +.macro CGEMV_N_8x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0, \ + A8, PA4, 0, A9, PA4, 0, \ + A10, PA5, 0, A11, PA5, 0, \ + A12, PA6, 0, A13, PA6, 0, \ + A14, PA7, 0, A15, PA7, 0 + + GCOMPLEXMADD GXCONJ, GCONJ, \ + xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \ + Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \ + Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2, \ + Y0, X4, A8, Y0, TMP0, TMP1, TMP2, Y1, X4, A9, Y1, TMP0, TMP1, TMP2, \ + Y0, X5, A10, Y0, TMP0, TMP1, TMP2, Y1, X5, A11, Y1, TMP0, TMP1, TMP2, \ + Y0, X6, A12, Y0, TMP0, TMP1, TMP2, Y1, X6, A13, Y1, TMP0, TMP1, TMP2, \ + Y0, X7, A14, Y0, TMP0, TMP1, TMP2, Y1, X7, A15, Y1, TMP0, TMP1, TMP2 +.endm + +.macro CSTORE_Y_8 + GST xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro CLOAD_X_1 + GLDREPL xv, d, X0, X, 0x00 + GCOMPLEXMUL GXCONJ, \ + xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2 +.endm + +.macro CLOAD_Y_1 + fld.d $f10, Y, 0 +.endm + +.macro CGEMV_N_1x8 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \ + $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \ + Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \ + Y0, X3, A6, Y0, TMP0, TMP1, TMP2, \ + Y0, X4, A8, Y0, TMP0, TMP1, TMP2, \ + Y0, X5, A10, Y0, TMP0, TMP1, TMP2, \ + Y0, X6, A12, Y0, TMP0, TMP1, TMP2, \ + Y0, X7, A14, Y0, TMP0, TMP1, TMP2 +.endm + +.macro CSTORE_Y_1 + fst.d $f10, Y, 0 +.endm + +.macro CGEMV_N_1x1 + fld.d $f12, PA0, 0 + PTR_ADDI PA0, PA0, 0x08 + GCOMPLEXMADD GXCONJ, GCONJ, \ + xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_N_LASX XW:req, X_8:req, X_1:req, Y_8:req, Y_1:req + PTR_SRLI J, N, 3 + beqz J, .L_\XW\()_N_7 + PTR_SLLI K_LDA, LDA, 3 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L8: + CLOAD_\X_8 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_M_7 +.align 5 +.L_\XW\()_M_L8: + CLOAD_\Y_8 + CGEMV_N_8x8 + CSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ALSL Y, INC_Y, Y, 3 + PTR_ADDI K, K, 8 + bnez I, .L_\XW\()_M_L8 +.L_\XW\()_M_7: + andi I, M, 7 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + CLOAD_\Y_1 + CGEMV_N_1x8 + CSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#endif + PTR_ALSL X, INC_X, X, 3 + bnez J, .L_\XW\()_N_L8 +.L_\XW\()_N_7: + andi J, N, 7 + beqz J, .L_END +.L_\XW\()_N_L1: + CLOAD_\X_1 + xor K, K, K + move Y, Y_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + CLOAD_\Y_1 + CGEMV_N_1x1 + CSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + PTR_SUB K_LDA, LDA, M8 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD X, X, INC_X + bnez J, .L_\XW\()_N_L1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 7, 31 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + PTR_SUB J, INC_Y, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ + PTR_ALSL I, I, J, 1 + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + // Init VALPHA + xvpackev.w $xr0, $xr1, $xr0 + xvreplve0.d VALPHA, $xr0 + move Y_ORG, Y + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 // Obtain the offset address + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0_0 - .L_GAP_TABLE + .hword .L_GAP_0_1 - .L_GAP_TABLE + .hword .L_GAP_1_0 - .L_GAP_TABLE + .hword .L_GAP_1_1 - .L_GAP_TABLE +.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ + CGEMV_N_LASX GAP_0_0, X_8, X_1, Y_8, Y_1 +.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ + CGEMV_N_LASX GAP_0_1, X_8, X_1, Y_8_GAP, Y_1 +.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ + CGEMV_N_LASX GAP_1_0, X_8_GAP, X_1, Y_8, Y_1 +.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ + CGEMV_N_LASX GAP_1_1, X_8_GAP, X_1, Y_8_GAP, Y_1 +.L_END: + pop_if_used 17 + 7, 31 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/cgemv_t_4_lsx.S b/kernel/loongarch64/cgemv_t_4_lsx.S new file mode 100644 index 000000000..ada349364 --- /dev/null +++ b/kernel/loongarch64/cgemv_t_4_lsx.S @@ -0,0 +1,290 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define PY0 $r14 +#define X_ORG $r15 +#define PY1 $r16 +#define K_LDA $r17 +#define PY2 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 +#define M8 $r30 + +#define VALPHA $vr0 +#define X0 $vr1 +#define X1 $vr2 +#define A0 $vr3 +#define A1 $vr4 +#define A2 $vr5 +#define A3 $vr6 +#define A4 $vr7 +#define A5 $vr8 +#define A6 $vr9 +#define A7 $vr10 +#define A8 $vr11 +#define A9 $vr12 +#define A10 $vr13 +#define A11 $vr14 +#define A12 $vr15 +#define A13 $vr16 +#define A14 $vr17 +#define A15 $vr18 +#define TP0 $vr19 +#define TP1 $vr20 +#define TP2 $vr21 +#define TP3 $vr22 +#define TP4 $vr23 +#define TP5 $vr24 +#define TP6 $vr25 +#define TP7 $vr26 +#define TMP0 $vr27 +#define TMP1 $vr28 +#define TMP2 $vr29 +#define Y0 $vr3 +#define Y1 $vr4 +#define Y2 $vr5 +#define Y3 $vr6 +#define Y4 $vr7 +#define Y5 $vr8 +#define Y6 $vr9 +#define Y7 $vr10 + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +#define GXCONJ1 0 +#define GCONJ1 0 +#else +#define GXCONJ1 1 +#define GCONJ1 0 +#endif + +#if !defined(XCONJ) +#define GXCONJ2 0 +#define GCONJ2 0 +#else +#define GXCONJ2 0 +#define GCONJ2 1 +#endif + +.macro ZERO_Y4 + GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 +.endm + +.macro ZERO_Y1 + GXOR v, v, TP0, TP0, TP0 +.endm + +.macro CLOAD_X4 + GLD v, , X0, X, 0x00, X1, X, 0x10 +.endm + +.macro CLOAD_X4_GAP + fld.d $f1, X, 0x00 + fldx.d $f3, X, INC_X + PTR_ALSL T0, INC_X, X, 1 + fld.d $f2, T0, 0x00 + fldx.d $f4, T0, INC_X + vpackev.d X0, A0, X0 + vpackev.d X1, A1, X1 +.endm + +.macro CGEMV_T_4x4 + GLD_INC v, , 0x10, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \ + TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \ + TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \ + TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_T_LSX XW:req, X4:req + PTR_SRLI J, N, 2 + beqz J, .L_\XW\()_N_3 + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L4: + ZERO_Y4 + move X, X_ORG + PTR_SRLI I, M, 2 + beqz I, .L_\XW\()_M_3 +.align 5 +.L_\XW\()_M_L4: + CLOAD_\X4 + CGEMV_T_4x4 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 2 + bnez I, .L_\XW\()_M_L4 +.L_\XW\()_M_3: + // Accumulated + GCOMPLEXACC vf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + fld.d $f1, X, 0x00 + fld.d $f11, PA0, 0x00 + fld.d $f12, PA1, 0x00 + fld.d $f13, PA2, 0x00 + fld.d $f14, PA3, 0x00 +#if __loongarch_grlen == 64 + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08 +#elif __loongarch_grlen == 32 + GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08 +#else + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08 +#endif + + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + vf, s, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \ + A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2 + + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + fld.d $f13, PY0, 0x00 + fldx.d $f14, PY0, INC_Y + + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + vf, s, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\ + A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2 + + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + fst.d $f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + fst.d $f13, PY0, 0x00 + fstx.d $f14, PY0, INC_Y + PTR_ALSL Y, INC_Y, Y, 2 + bnez J, .L_\XW\()_N_L4 +.L_\XW\()_N_3: + andi J, N, 3 + beqz J, .L_END + PTR_SUB K_LDA, LDA, M8 +.L_\XW\()_N_1: + ZERO_Y1 + move X, X_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + fld.d $f3, PA0, 0x00 + fld.d $f1, X, 0x00 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + PTR_ADDI PA0, PA0, 0x08 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + fld.d $f3, Y, 0x00 + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + vf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2 + fst.d $f3, Y, 0x00 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD Y, Y, INC_Y + bnez J, .L_\XW\()_N_1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 8, 30 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + // Init VALPHA + vpackev.w $vr0, $vr1, $vr0 + vpackev.d VALPHA, $vr0, $vr0 + move X_ORG, X + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0 - .L_GAP_TABLE + .hword .L_GAP_1 - .L_GAP_TABLE +.L_GAP_0: /* if (incx == 1) */ + CGEMV_T_LSX GAP_0, X4 +.L_GAP_1: /* if (incx != 1) */ + CGEMV_T_LSX GAP_1, X4_GAP +.L_END: + pop_if_used 17 + 8, 30 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/cgemv_t_8_lasx.S b/kernel/loongarch64/cgemv_t_8_lasx.S new file mode 100644 index 000000000..94e4bd2eb --- /dev/null +++ b/kernel/loongarch64/cgemv_t_8_lasx.S @@ -0,0 +1,342 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2022/02/20 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define PY0 $r14 +#define X_ORG $r15 +#define PY1 $r16 +#define K_LDA $r17 +#define PY2 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 +#define M8 $r30 + +#define VALPHA $xr0 +#define X0 $xr1 +#define X1 $xr2 +#define A0 $xr3 +#define A1 $xr4 +#define A2 $xr5 +#define A3 $xr6 +#define A4 $xr7 +#define A5 $xr8 +#define A6 $xr9 +#define A7 $xr10 +#define A8 $xr11 +#define A9 $xr12 +#define A10 $xr13 +#define A11 $xr14 +#define A12 $xr15 +#define A13 $xr16 +#define A14 $xr17 +#define A15 $xr18 +#define TP0 $xr19 +#define TP1 $xr20 +#define TP2 $xr21 +#define TP3 $xr22 +#define TP4 $xr23 +#define TP5 $xr24 +#define TP6 $xr25 +#define TP7 $xr26 +#define TMP0 $xr27 +#define TMP1 $xr28 +#define TMP2 $xr29 +#define Y0 $xr3 +#define Y1 $xr4 +#define Y2 $xr5 +#define Y3 $xr6 +#define Y4 $xr7 +#define Y5 $xr8 +#define Y6 $xr9 +#define Y7 $xr10 + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +#define GXCONJ1 0 +#define GCONJ1 0 +#else +#define GXCONJ1 1 +#define GCONJ1 0 +#endif + +#if !defined(XCONJ) +#define GXCONJ2 0 +#define GCONJ2 0 +#else +#define GXCONJ2 0 +#define GCONJ2 1 +#endif + +.macro ZERO_Y8 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \ + TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7 +.endm + +.macro ZERO_Y1 + GXOR xv, v, TP0, TP0, TP0 +.endm + +.macro CLOAD_X8 + GLD xv, , X0, X, 0x00, X1, X, 0x20 +.endm + +.macro CLOAD_X8_GAP + fld.d $f1, X, 0x00 + fldx.d $f2, X, INC_X + PTR_ALSL T0, INC_X, X, 1 + fld.d $f3, T0, 0x00 + fldx.d $f4, T0, INC_X + GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 + PTR_ALSL T0, INC_X, X, 2 + fld.d $f2, T0, 0x00 + fldx.d $f3, T0, INC_X + PTR_ALSL T0, INC_X, T0, 1 + fld.d $f4, T0, 0x00 + fldx.d $f5, T0, INC_X + GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3 +.endm + +.macro CGEMV_T_8x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0, \ + A8, PA4, 0, A9, PA4, 0, \ + A10, PA5, 0, A11, PA5, 0, \ + A12, PA6, 0, A13, PA6, 0, \ + A14, PA7, 0, A15, PA7, 0 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + xvf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \ + TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \ + TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \ + TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2, \ + TP4, A8, X0, TP4, TMP0, TMP1, TMP2, TP4, A9, X1, TP4, TMP0, TMP1, TMP2, \ + TP5, A10, X0, TP5, TMP0, TMP1, TMP2, TP5, A11, X1, TP5, TMP0, TMP1, TMP2, \ + TP6, A12, X0, TP6, TMP0, TMP1, TMP2, TP6, A13, X1, TP6, TMP0, TMP1, TMP2, \ + TP7, A14, X0, TP7, TMP0, TMP1, TMP2, TP7, A15, X1, TP7, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_T_LASX XW:req, X8:req + PTR_SRLI J, N, 3 + beqz J, .L_\XW\()_N_7 + PTR_SLLI K_LDA, LDA, 3 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L8: + ZERO_Y8 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_M_7 +.align 5 +.L_\XW\()_M_L8: + CLOAD_\X8 + CGEMV_T_8x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_M_L8 +.L_\XW\()_M_7: + // Accumulated + GCOMPLEXACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \ + Y5, TP5, Y6, TP6, Y7, TP7 + andi I, M, 7 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + fld.d $f1, X, 0x00 + fld.d $f11, PA0, 0x00 + fld.d $f12, PA1, 0x00 + fld.d $f13, PA2, 0x00 + fld.d $f14, PA3, 0x00 + fld.d $f15, PA4, 0x00 + fld.d $f16, PA5, 0x00 + fld.d $f17, PA6, 0x00 + fld.d $f18, PA7, 0x00 +#if __loongarch_grlen == 64 + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#elif __loongarch_grlen == 32 + GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#else + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#endif + + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + xvf, s, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \ + A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2, \ + A4, A12, X0, A4, TMP0, TMP1, TMP2, A5, A13, X0, A5, TMP0, TMP1, TMP2, \ + A6, A14, X0, A6, TMP0, TMP1, TMP2, A7, A15, X0, A7, TMP0, TMP1, TMP2 + + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + fld.d $f13, PY0, 0x00 + fldx.d $f14, PY0, INC_Y + PTR_ALSL PY1, INC_Y, Y, 2 + fld.d $f15, PY1, 0x00 + fldx.d $f16, PY1, INC_Y + PTR_ALSL PY2, INC_Y, PY1, 1 + fld.d $f17, PY2, 0x00 + fldx.d $f18, PY2, INC_Y + + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + xvf, s, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\ + A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2,\ + A12, VALPHA, A4, A12, TMP0, TMP1, TMP2, A13, VALPHA, A5, A13, TMP0, TMP1, TMP2,\ + A14, VALPHA, A6, A14, TMP0, TMP1, TMP2, A15, VALPHA, A7, A15, TMP0, TMP1, TMP2 + + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#endif + fst.d $f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + fst.d $f13, PY0, 0x00 + fstx.d $f14, PY0, INC_Y + fst.d $f15, PY1, 0x00 + fstx.d $f16, PY1, INC_Y + fst.d $f17, PY2, 0x00 + fstx.d $f18, PY2, INC_Y + PTR_ALSL Y, INC_Y, Y, 3 + bnez J, .L_\XW\()_N_L8 +.L_\XW\()_N_7: + andi J, N, 7 + beqz J, .L_END + PTR_SUB K_LDA, LDA, M8 +.L_\XW\()_N_1: + ZERO_Y1 + move X, X_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + fld.d $f3, PA0, 0x00 + fld.d $f1, X, 0x00 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + xvf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + PTR_ADDI PA0, PA0, 0x08 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + fld.d $f3, Y, 0x00 + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + xvf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2 + fst.d $f3, Y, 0x00 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD Y, Y, INC_Y + bnez J, .L_\XW\()_N_1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 8, 30 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + // Init VALPHA + xvpackev.w $xr0, $xr1, $xr0 + xvreplve0.d VALPHA, $xr0 + move X_ORG, X + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0 - .L_GAP_TABLE + .hword .L_GAP_1 - .L_GAP_TABLE +.L_GAP_0: /* if (incx == 1) */ + CGEMV_T_LASX GAP_0, X8 +.L_GAP_1: /* if (incx != 1) */ + CGEMV_T_LASX GAP_1, X8_GAP +.L_END: + pop_if_used 17 + 8, 30 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/cnrm2.S b/kernel/loongarch64/cnrm2.S index 9d27987e1..41667485a 100644 --- a/kernel/loongarch64/cnrm2.S +++ b/kernel/loongarch64/cnrm2.S @@ -61,7 +61,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmov.d s2, s1 bge $r0, N, .L999 slli.d INCX, INCX, ZBASE_SHIFT - bge $r0, INCX, .L999 + beq $r0, INCX, .L999 srai.d I, N, 2 bge $r0, I, .L25 LD a1, X, 0 * SIZE diff --git a/kernel/loongarch64/cnrm2_lasx.S b/kernel/loongarch64/cnrm2_lasx.S new file mode 100644 index 000000000..3a60069ac --- /dev/null +++ b/kernel/loongarch64/cnrm2_lasx.S @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define a1 $f15 +#define a2 $f16 +#define res $f19 +#define VX0 $xr15 +#define VX1 $xr16 +#define VX2 $xr17 +#define VX3 $xr18 +#define VX4 $xr21 +#define res1 $xr19 +#define res2 $xr20 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L997 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + .align 3 + b .L996 + +.L20: + bge $r0, I, .L997 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L996 + +.L996: + xvfadd.d res1, res1, res2 + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s a1, X, 0 * SIZE + fld.s a2, X, 1 * SIZE + addi.d I, I, -1 + fcvt.d.s a1, a1 + fcvt.d.s a2, a2 + fmadd.d res, a1, a1, res + fmadd.d res, a2, a2, res + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d res, res + move $r4, $r17 + fcvt.s.d $f0, res + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/cnrm2_lsx.S b/kernel/loongarch64/cnrm2_lsx.S new file mode 100644 index 000000000..20950ba17 --- /dev/null +++ b/kernel/loongarch64/cnrm2_lsx.S @@ -0,0 +1,155 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define a1 $f15 +#define a2 $f16 +#define res $f19 +#define VX0 $vr15 +#define VX1 $vr16 +#define VX2 $vr17 +#define VX3 $vr18 +#define VX4 $vr21 +#define res1 $vr19 +#define res2 $vr20 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L997 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + vld VX0, X, 4 * SIZE + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L996 + .align 3 + +.L20: + bge $r0, I, .L997 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s a1, X, 0 * SIZE + fld.s a2, X, 1 * SIZE + addi.d I, I, -1 + fcvt.d.s a1, a1 + fcvt.d.s a2, a2 + fmadd.d res, a1, a1, res + fmadd.d res, a2, a2, res + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d res, res + move $r4, $r17 + fcvt.s.d $f0, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/copy_lasx.S b/kernel/loongarch64/copy_lasx.S new file mode 100644 index 000000000..31f91cec1 --- /dev/null +++ b/kernel/loongarch64/copy_lasx.S @@ -0,0 +1,306 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $xr12 +#define VX1 $xr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 + addi.d I, I, -1 + xvst VX0, Y, 0 +#ifdef DOUBLE + xvld VX0, X, 32 + xvst VX0, Y, 32 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 3 + add.d Y, Y, INCY +#else + xvld VX0, X, 0 + xvstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 4 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 5 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 6 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 7 + add.d Y, Y, INCY +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvst VX0, Y, 0 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvst VX1, Y, 32 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvst VX0, Y, 0 +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +/* INCX!=1 and INCY!=1 */ +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/copy_lsx.S b/kernel/loongarch64/copy_lsx.S new file mode 100644 index 000000000..bb10f3565 --- /dev/null +++ b/kernel/loongarch64/copy_lsx.S @@ -0,0 +1,316 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + vst VX0, Y, 0 + vst VX1, Y, 16 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + vst VX0, Y, 32 + vst VX1, Y, 48 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + vld VX0, X, 32 + vld VX1, X, 48 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY +#else + vld VX0, X, 0 + vld VX1, X, 16 + vstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 3 + add.d Y, Y, INCY +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 0 + vst VX1, Y, 16 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 32 + vst VX1, Y, 48 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vst VX0, Y, 0 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX1, Y, 16 +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +/* INCX!=1 and INCY!=1 */ +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/crot_lasx.S b/kernel/loongarch64/crot_lasx.S new file mode 100644 index 000000000..d4ec1e22c --- /dev/null +++ b/kernel/loongarch64/crot_lasx.S @@ -0,0 +1,1079 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VT0 $xr10 +#define VT1 $xr18 +#define VXC $xr23 +#define VXS $xr9 +#define VXZ $xr11 +#define x1 $xr12 +#define x2 $xr13 +#define x3 $xr14 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, C + MTG t2, S + MTG t3, a1 +#ifdef DOUBLE + xvreplgr2vr.d VXC, t1 + xvreplgr2vr.d VXS, t2 + xvreplgr2vr.d VXZ, t3 + srai.d I, N, 2 +#else + xvreplgr2vr.w VXC, t1 + xvreplgr2vr.w VXS, t2 + xvreplgr2vr.w VXZ, t3 + srai.d I, N, 3 +#endif + beq INCX, $r0, .L996 + beq INCY, $r0, .L996 + bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1 + bne INCY, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x1, VXC + xvfmadd.d VX0, x3, VXS, VX0 + xvfmul.d VX1, x1, VXS + xvfmsub.d VX1, x3, VXC, VX1 + xvfmul.d VX2, x2, VXC + xvfmadd.d VX2, x4, VXS, VX2 + xvfmul.d VX3, x2, VXS + xvfmsub.d VX3, x4, VXC, VX3 + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x1, VXC + xvfmadd.s VX0, x3, VXS, VX0 + xvfmul.s VX1, x1, VXS + xvfmsub.s VX1, x3, VXC, VX1 + xvfmul.s VX2, x2, VXC + xvfmadd.s VX2, x4, VXS, VX2 + xvfmul.s VX3, x2, VXS + xvfmsub.s VX3, x4, VXC, VX3 + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x1, VXC + xvfmul.d VX1, x3, VXC + xvfmul.d VX2, x2, VXC + xvfmul.d VX3, x4, VXC + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x1, VXC + xvfmul.s VX1, x3, VXC + xvfmul.s VX2, x2, VXC + xvfmul.s VX3, x4, VXC + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x3, VXS + xvfmul.d VX1, x1, VXS + xvfsub.d VX1, VXZ, VX1 + xvfmul.d VX2, x4, VXS + xvfmul.d VX3, x2, VXS + xvfsub.d VX3, VXZ, VX3 + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x3, VXS + xvfmul.s VX1, x1, VXS + xvfsub.s VX1, VXZ, VX1 + xvfmul.s VX2, x4, VXS + xvfmul.s VX3, x2, VXS + xvfsub.s VX3, VXZ, VX3 + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + xvst VXZ, X, 0 * SIZE + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 4 * SIZE + xvst VXZ, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvst VXZ, X, 8 * SIZE + xvst VXZ, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + + xvfmul.d VX0, x1, VXC + xvfmadd.d VX0, x3, VXS, VX0 + xvfmul.d VX1, x1, VXS + xvfmsub.d VX1, x3, VXC, VX1 + xvfmul.d VX2, x2, VXC + xvfmadd.d VX2, x4, VXS, VX2 + xvfmul.d VX3, x2, VXS + xvfmsub.d VX3, x4, VXC, VX3 + xvstelm.d VX0, XX, 0, 0 + xvstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + + xvfmul.s VX0, x1, VXC + xvfmadd.s VX0, x3, VXS, VX0 + xvfmul.s VX1, x1, VXS + xvfmsub.s VX1, x3, VXC, VX1 + xvfmul.s VX2, x2, VXC + xvfmadd.s VX2, x4, VXS, VX2 + xvfmul.s VX3, x2, VXS + xvfmsub.s VX3, x4, VXC, VX3 + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, x1, VXC + xvfmul.d VX1, x3, VXC + xvfmul.d VX2, x2, VXC + xvfmul.d VX3, x4, VXC + xvstelm.d VX0, XX, 0, 0 + xvstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, x1, VXC + xvfmul.s VX1, x3, VXC + xvfmul.s VX2, x2, VXC + xvfmul.s VX3, x4, VXC + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, x3, VXS + xvfmul.d VX1, x1, VXS + xvfsub.d VX1, VXZ, VX1 + xvfmul.d VX2, x4, VXS + xvfmul.d VX3, x2, VXS + xvfsub.d VX3, VXZ, VX3 + xvstelm.d VX0, XX, 0, 0 + xvstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, x3, VXS + xvfmul.s VX1, x1, VXS + xvfsub.s VX1, VXZ, VX1 + xvfmul.s VX2, x4, VXS + xvfmul.s VX3, x2, VXS + xvfsub.s VX3, VXZ, VX3 + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 +#else + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L997 + .align 3 + +.L996: + move I, N + b .L998 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MUL s1, a1, C + MADD s1, a3, S, s1 + MUL s2, a1, S + MSUB s2, a3, C, s2 + MUL s3, a2, C + MADD s3, a4, S, s3 + MUL s4, a2, S + MSUB s4, a4, C, s4 + addi.d I, I, -1 + ST s1, X, 0 * SIZE + ST s2, Y, 0 * SIZE + ST s3, X, 1 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/crot_lsx.S b/kernel/loongarch64/crot_lsx.S new file mode 100644 index 000000000..126257edc --- /dev/null +++ b/kernel/loongarch64/crot_lsx.S @@ -0,0 +1,907 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VT0 $vr10 +#define VT1 $vr18 +#define VXC $vr23 +#define VXS $vr9 +#define VXZ $vr11 +#define x1 $vr12 +#define x2 $vr13 +#define x3 $vr14 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, C + MTG t2, S + MTG t3, a1 +#ifdef DOUBLE + vreplgr2vr.d VXC, t1 + vreplgr2vr.d VXS, t2 + vreplgr2vr.d VXZ, t3 +#else + vreplgr2vr.w VXC, t1 + vreplgr2vr.w VXS, t2 + vreplgr2vr.w VXZ, t3 + srai.d I, N, 2 +#endif + beq INCX, $r0, .L996 + beq INCY, $r0, .L996 + bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1 + bne INCY, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x1, VXC + vfmadd.s VX0, x3, VXS, VX0 + vfmul.s VX1, x1, VXS + vfmsub.s VX1, x3, VXC, VX1 + vfmul.s VX2, x2, VXC + vfmadd.s VX2, x4, VXS, VX2 + vfmul.s VX3, x2, VXS + vfmsub.s VX3, x4, VXC, VX3 + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x1, VXC + vfmul.s VX1, x3, VXC + vfmul.s VX2, x2, VXC + vfmul.s VX3, x4, VXC + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x3, VXS + vfmul.s VX1, x1, VXS + vfsub.s VX1, VXZ, VX1 + vfmul.s VX2, x4, VXS + vfmul.s VX3, x2, VXS + vfsub.s VX3, VXZ, VX3 + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vst VXZ, X, 4 * SIZE + vst VXZ, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: +#ifdef DOUBLE + srai.d I, N, 2 +#endif + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + vfmul.s VX0, x1, VXC + vfmadd.s VX0, x3, VXS, VX0 + vfmul.s VX1, x1, VXS + vfmsub.s VX1, x3, VXC, VX1 + vfmul.s VX2, x2, VXC + vfmadd.s VX2, x4, VXS, VX2 + vfmul.s VX3, x2, VXS + vfmsub.s VX3, x4, VXC, VX3 + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 +#endif + .align 3 + +.L222: // C!=0 S==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, x1, VXC + vfmul.s VX1, x3, VXC + vfmul.s VX2, x2, VXC + vfmul.s VX3, x4, VXC + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 +#endif + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, x3, VXS + vfmul.s VX1, x1, VXS + vfsub.s VX1, VXZ, VX1 + vfmul.s VX2, x4, VXS + vfmul.s VX3, x2, VXS + vfsub.s VX3, VXZ, VX3 + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 +#endif + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L995 +#else + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L997 +#endif + .align 3 + +#ifdef DOUBLE + .L995: + andi I, N, 3 + bge $r0, I, .L999 + b .L998 + .align 3 + +#endif +.L996: + move I, N + b .L998 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 1 +#else + andi I, N, 3 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MUL s1, a1, C + MADD s1, a3, S, s1 + MUL s2, a1, S + MSUB s2, a3, C, s2 + MUL s3, a2, C + MADD s3, a4, S, s3 + MUL s4, a2, S + MSUB s4, a4, C, s4 + addi.d I, I, -1 + ST s1, X, 0 * SIZE + ST s2, Y, 0 * SIZE + ST s3, X, 1 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cscal_lasx.S b/kernel/loongarch64/cscal_lasx.S new file mode 100644 index 000000000..f53526663 --- /dev/null +++ b/kernel/loongarch64/cscal_lasx.S @@ -0,0 +1,500 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXAR $xr23 +#define VXAI $xr19 +#define VXZ $xr12 +#define x1 $xr18 +#define x2 $xr17 +#define x3 $xr16 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + MTG t1, ALPHAR +#ifdef DOUBLE + xvreplgr2vr.d VXAR, t1 + movfr2gr.d t2, ALPHAI + xvreplgr2vr.d VXAI, t2 + xvxor.v VXZ, VXZ, VXZ + srai.d I, N, 2 +#else + xvreplgr2vr.w VXAR, t1 + movfr2gr.s t2, ALPHAI + xvreplgr2vr.w VXAI, t2 + xvxor.v VXZ, VXZ, VXZ + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 + b .L113 //alpha_r != 0.0 && alpha_i == 0.0 + +.L14: + bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 + b .L111 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L111: //alpha_r == 0.0 && alpha_i == 0.0 + xvst VXZ, X, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvst VXZ, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L113: //alpha_r != 0.0 && alpha_i == 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VXAR, x1 + xvfmul.d x4, VXAR, x2 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VXAR, x1 + xvfmul.s x4, VXAR, x2 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: //alpha_r != 0.0 && alpha_i != 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d VX0, VXAI, x2 + xvfmsub.d x3, VXAR, x1, VX0 + xvfmul.d VX1, VXAI, x1 + xvfmadd.d x4, VXAR, x2, VX1 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s VX0, VXAI, x2 + xvfmsub.s x3, VXAR, x1, VX0 + xvfmul.s VX1, VXAI, x1 + xvfmadd.s x4, VXAR, x2, VX1 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 + b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + +.L24: + bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 + b .L221 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L221: //alpha_r == 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 +#else + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 +#endif + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L223: //alpha_r != 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d x3, VXAR, x1 + xvfmul.d x4, VXAR, x2 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s x3, VXAR, x1 + xvfmul.s x4, VXAR, x2 + addi.d I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, XX, INCX + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: //alpha_r != 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d VX0, VXAI, x2 + xvfmsub.d x3, VXAR, x1, VX0 + xvfmul.d VX1, VXAI, x1 + xvfmadd.d x4, VXAR, x2, VX1 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s VX0, VXAI, x2 + xvfmsub.s x3, VXAR, x1, VX0 + xvfmul.s VX1, VXAI, x1 + xvfmadd.s x4, VXAR, x2, VX1 + addi.d I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, XX, INCX + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s1, ALPHAR, a1, s1 + MADD s2, ALPHAR, a2, s2 + ST s1, X, 0 * SIZE + ST s2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cscal_lsx.S b/kernel/loongarch64/cscal_lsx.S new file mode 100644 index 000000000..241d3d16e --- /dev/null +++ b/kernel/loongarch64/cscal_lsx.S @@ -0,0 +1,445 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXAR $vr23 +#define VXAI $vr19 +#define VXZ $vr12 +#define x1 $vr18 +#define x2 $vr17 +#define x3 $vr16 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + MTG t1, ALPHAR +#ifdef DOUBLE + vreplgr2vr.d VXAR, t1 + movfr2gr.d t2, ALPHAI + vreplgr2vr.d VXAI, t2 +#else + vreplgr2vr.w VXAR, t1 + movfr2gr.s t2, ALPHAI + vreplgr2vr.w VXAI, t2 +#endif + vxor.v VXZ, VXZ, VXZ + srai.d I, N, 2 + bne INCX, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 + b .L113 //alpha_r != 0.0 && alpha_i == 0.0 + +.L14: + bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 + b .L111 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L111: //alpha_r == 0.0 && alpha_i == 0.0 + vst VXZ, X, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, X, 4 * SIZE + vst VXZ, X, 6 * SIZE +#else + vst VXZ, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L113: //alpha_r != 0.0 && alpha_i == 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXAR, x1 + vfmul.s x4, VXAR, x2 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: //alpha_r != 0.0 && alpha_i != 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s VX0, VXAI, x2 + vfmsub.s x3, VXAR, x1, VX0 + vfmul.s VX1, VXAI, x1 + vfmadd.s x4, VXAR, x2, VX1 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 + b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + +.L24: + bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 + b .L221 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L221: //alpha_r == 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 +#else + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 +#endif + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L223: //alpha_r != 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s x3, VXAR, x1 + vfmul.s x4, VXAR, x2 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: //alpha_r != 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s VX0, VXAI, x2 + vfmsub.s x3, VXAR, x1, VX0 + vfmul.s VX1, VXAI, x1 + vfmadd.s x4, VXAR, x2, VX1 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s1, ALPHAR, a1, s1 + MADD s2, ALPHAR, a2, s2 + ST s1, X, 0 * SIZE + ST s2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/csum_lasx.S b/kernel/loongarch64/csum_lasx.S new file mode 100644 index 000000000..3e65f2c15 --- /dev/null +++ b/kernel/loongarch64/csum_lasx.S @@ -0,0 +1,274 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvfadd.d res2, VX2, VX3 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/csum_lsx.S b/kernel/loongarch64/csum_lsx.S new file mode 100644 index 000000000..8de8e27ca --- /dev/null +++ b/kernel/loongarch64/csum_lsx.S @@ -0,0 +1,266 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfadd.s res2, VX0, VX1 + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + + addi.d I, I, -1 + addi.d X, X, 16 * SIZE + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfadd.s res2, VX0, VX1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cswap_lasx.S b/kernel/loongarch64/cswap_lasx.S new file mode 100644 index 000000000..d53773d5a --- /dev/null +++ b/kernel/loongarch64/cswap_lasx.S @@ -0,0 +1,394 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvst VX2, X, 0 * SIZE + xvst VX0, Y, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + xvstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 2 + ld.d t4, Y, 1 * SIZE + xvstelm.d VX0, Y, 1 * SIZE, 3 + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvst VX2, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + xvstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + xvstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + xvstelm.d VX1, Y, 0 * SIZE, 2 + ld.d t4, Y, 1 * SIZE + xvstelm.d VX1, Y, 1 * SIZE, 3 + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvst VX3, X, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 3 + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 4 + ld.w t2, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 6 + ld.w t4, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 7 + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvst VX2, X, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + xvstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + xvstelm.d VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + xvstelm.d VX2, X, 0 * SIZE, 2 + ld.d t4, X, 1 * SIZE + xvstelm.d VX2, X, 1 * SIZE, 3 + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvst VX0, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + xvstelm.d VX3, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + xvstelm.d VX3, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + xvstelm.d VX3, X, 0 * SIZE, 2 + ld.d t4, X, 1 * SIZE + xvstelm.d VX3, X, 1 * SIZE, 3 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvst VX1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 3 + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 4 + ld.w t2, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 5 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 6 + ld.w t4, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 7 + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvst VX0, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + ST b1, XX, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, X, 0 * SIZE + ST b3, XX, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + ST b1, XX, 0 * SIZE + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + ST b3, XX, 0 * SIZE + ST b4, XX, 1 * SIZE + + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cswap_lsx.S b/kernel/loongarch64/cswap_lsx.S new file mode 100644 index 000000000..62a869066 --- /dev/null +++ b/kernel/loongarch64/cswap_lsx.S @@ -0,0 +1,421 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE + vst VX0, Y, 4 * SIZE + vst VX1, Y, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + vstelm.d VX0, Y, 1 * SIZE, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + vstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t4, Y, 1 * SIZE + vstelm.d VX1, Y, 1 * SIZE, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + vstelm.d VX0, Y, 1 * SIZE, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + vstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t4, Y, 1 * SIZE + vstelm.d VX1, Y, 1 * SIZE, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + vstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + vstelm.w VX0, Y, 1 * SIZE, 3 + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vst VX2, X, 0 * SIZE + + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + vstelm.w VX1, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + vstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + vstelm.w VX1, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + vstelm.w VX1, Y, 1 * SIZE, 3 + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + vstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + vstelm.d VX2, X, 1 * SIZE, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + vstelm.d VX3, X, 0 * SIZE, 0 + ld.d t4, X, 1 * SIZE + vstelm.d VX3, X, 1 * SIZE, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + vstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + vstelm.d VX2, X, 1 * SIZE, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + vstelm.d VX3, X, 0 * SIZE, 0 + ld.d t4, X, 1 * SIZE + vstelm.d VX3, X, 1 * SIZE, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + vstelm.w VX2, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + vstelm.w VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + vstelm.w VX2, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + vstelm.w VX2, X, 1 * SIZE, 3 + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vst VX0, Y, 0 * SIZE + + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + vstelm.w VX3, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + vstelm.w VX3, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + vstelm.w VX3, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + vstelm.w VX3, X, 1 * SIZE, 3 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vst VX1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + ST b1, XX, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, X, 0 * SIZE + ST b3, XX, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + ST b1, XX, 0 * SIZE + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + ST b3, XX, 0 * SIZE + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_kernel_16x4.S b/kernel/loongarch64/dgemm_kernel_16x4.S index 13faa977e..f8e26fda2 100644 --- a/kernel/loongarch64/dgemm_kernel_16x4.S +++ b/kernel/loongarch64/dgemm_kernel_16x4.S @@ -28,6 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" +/********************************************************************* +* 2023/06/28 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2023/06/28 guxiwei +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 4 +* DGEMM_DEFAULT_UNROLL_M 16 +* DGEMM_DEFAULT_P 32 +* DGEMM_DEFAULT_Q 152 +* DGEMM_DEFAULT_R 858 +* A_PR1 1024 +* B_PR1 256 +* +* +* Performance at Loongson 3A5000 2.5GHz with 5000x5000x5000: +* 1 thread: 36.0 GFLOPS +* 2 threads: 71.6 GFLOPS +* 3 threads: 101.5 GFLOPS +* 4 threads: 132.8 GFLOPS +*********************************************************************/ + /* Function parameters */ #define M $r4 // param 1: bm #define N $r5 // param 2: bn @@ -68,1290 +93,1331 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define U4 $xr4 #define U5 $xr5 #define U6 $xr6 -#define D0 $xr7 -#define D1 $xr8 -#define D2 $xr9 -#define D3 $xr10 -#define D4 $xr11 -#define D5 $xr12 -#define D6 $xr13 -#define D7 $xr14 -#define D8 $xr15 -#define D9 $xr16 -#define D10 $xr17 -#define D11 $xr18 -#define D12 $xr19 -#define D13 $xr20 -#define D14 $xr21 -#define D15 $xr22 -#define VALPHA $xr23 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 +#define VALPHA $xr15 /* Prefetch interval */ -#define A_PRE 0x200 +#define A_PRE 0x400 #define B_PRE 0x100 - PROLOGUE - - addi.d $sp, $sp, -56 - /* Store regs */ - SDARG $r23, $sp, 0 - SDARG $r24, $sp, 8 - SDARG $r25, $sp, 16 - SDARG $r26, $sp, 24 - SDARG $r27, $sp, 32 - ST $f23, $sp, 40 - ST ALPHA, $sp, 48 - - /* VALPHA = {ALPHA, ALPHA, ALPHA, ALPHA} */ - xvld VALPHA, $sp, 48 - xvreplve0.d VALPHA, VALPHA - -#if defined (TRMMKERNEL) && !defined(LEFT) - sub.d OFF, ZERO, OFFSET -#else - xor OFF, OFF, OFF -#endif - - /* if (!(N >> 2)) goto L_N3 */ - srai.d J, N, 2 /* J = bn >> 2 */ - andi N, N, 0x03 - beq ZERO, J, .L_N3 - -.L_J1: /* J-- && This loop include Condition 1 */ - -/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! ************************* -* dgemm_core_16x4 */ - move C0, C - move A0, A - slli.d T0, LDC, 3 - add.d C1, C0, T0 - addi.d J, J, -1 /* J-- */ - add.d C2, C1, T0 - add.d C3, C2, T0 - -#if defined(TRMMKERNEL) && defined(LEFT) - move OFF, OFFSET -#endif - - /* if (!(M >> 4)) goto L_M8 */ - srai.d I, M, 4 /* I = bm >> 4 */ - beq ZERO, I, .L_M8 - -.L_I1: /* I-- */ -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B -#else - slli.d T0, OFF, 0x07 - add.d A0, A0, T0 - slli.d T0, OFF, 0x05 - add.d B0, B, T0 -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - sub.d L, K, OFF -#elif defined(LEFT) - /* number of values in A */ - addi.d L, OFF, 16 -#else - /* number of values in B */ - addi.d L, OFF, 4 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif - /* Calculate the first set of D0~D15, - * avoidig set 0 operation - * Load 16 * 64 from A0 - * U0 = {a3, a2, a1, a0} - * U1 = {a7, a6, a5, a4} - * U2 = {a11, a10, a9, a8} - * U3 = {a15, a14, a13, a12} - */ +.macro KERNEL2x16x4 xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - preld 0, C0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - xvfmul.d D1, U1, U4 - preld 0, C0, 0x40 - xvfmul.d D2, U2, U4 - xvfmul.d D3, U3, U4 - - xvldrepl.d U4, B0, 0x08 - preld 0, C1, 0x00 - /* line 2 */ - xvfmul.d D4, U0, U4 - xvfmul.d D5, U1, U4 - preld 0, C1, 0x40 - xvfmul.d D6, U2, U4 - xvfmul.d D7, U3, U4 - - xvldrepl.d U4, B0, 0x10 - preld 0, C2, 0x00 - /* line 3 */ - xvfmul.d D8, U0, U4 - xvfmul.d D9, U1, U4 - preld 0, C2, 0x40 - xvfmul.d D10, U2, U4 - xvfmul.d D11, U3, U4 - - xvldrepl.d U4, B0, 0x18 - preld 0, C3, 0x00 - /* line 4 */ - xvfmul.d D12, U0, U4 - xvfmul.d D13, U1, U4 - preld 0, C3, 0x40 - xvfmul.d D14, U2, U4 - xvfmul.d D15, U3, U4 - - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_L7 */ - beq ZERO,TL, .L_L7 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 - /* Calculate 8 sets of D0~D15 */ -.L_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + preld 0, B0, B_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D10, U10, U14, D10 + xvfmadd.d D11, U11, U14, D11 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 preld 0, A0, A_PRE + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 preld 0, A0, A_PRE + 0x40 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D14, U10, U15, D14 + xvfmadd.d D15, U11, U15, D15 addi.d A0, A0, 0x80 addi.d B0, B0, 0x20 - /***8-2***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 + + xvld U9, A0, 0x20 xvfmadd.d D2, U2, U4, D2 xvfmadd.d D3, U3, U4, D3 + + xvld U10, A0, 0x40 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvld U11, A0, 0x60 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvldrepl.d U12, B0, 0x00 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + preld 0, B0, B_PRE + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 preld 0, A0, A_PRE + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 preld 0, A0, A_PRE + 0x40 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 addi.d A0, A0, 0x80 addi.d B0, B0, 0x20 +.endm - /***8-3***/ - /* Load 16 * 64 from A0 */ +.macro KERNEL2x16x4_END xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + preld 0, B0, B_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D10, U10, U14, D10 + xvfmadd.d D11, U11, U14, D11 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 preld 0, A0, A_PRE + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 preld 0, A0, A_PRE + 0x40 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D14, U10, U15, D14 + xvfmadd.d D15, U11, U15, D15 addi.d A0, A0, 0x80 addi.d B0, B0, 0x20 - /***8-4***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 xvfmadd.d D3, U3, U4, D3 + + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + preld 0, B0, B_PRE + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 preld 0, A0, A_PRE + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 preld 0, A0, A_PRE + 0x40 + xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 +.macro KERNEL8x16x4 +.rept 4 + KERNEL2x16x4 +.endr +.endm - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 +.macro KERNEL8x16x4_END +.rept 3 + KERNEL2x16x4 +.endr + KERNEL2x16x4_END +.endm - /***8-5***/ - /* Load 16 * 64 from A0 */ +.macro KERNEL2x8x4 xvld U0, A0, 0x00 xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 - addi.d A0, A0, 0x80 + addi.d A0, A0, 0x40 addi.d B0, B0, 0x20 - /***8-6***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 + xvldrepl.d U12, B0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 - addi.d A0, A0, 0x80 + addi.d A0, A0, 0x40 addi.d B0, B0, 0x20 +.endm - /***8-7***/ - /* Load 16 * 64 from A0 */ +.macro KERNEL2x8x4_END xvld U0, A0, 0x00 xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 +.endm - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 +.macro KERNEL8x8x4 +.rept 4 + KERNEL2x8x4 +.endr +.endm - /***8-8***/ - /* Load 16 * 64 from A0 */ +.macro KERNEL8x8x4_END +.rept 3 + KERNEL2x8x4 +.endr + KERNEL2x8x4_END +.endm + +.macro KERNEL2x4x4 xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE + xvfmadd.d D0, U8, U12, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 - addi.d A0, A0, 0x80 + addi.d A0, A0, 0x20 addi.d B0, B0, 0x20 - addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_TL1 + xvld U8, A0, 0x00 - /* Maybe we need calculate the last - * 7 sets of D0~D15? - */ -.L_L7: - /* if (!(L & 7)) goto L_L0 */ - andi TL, L, 7 - beq TL, ZERO,.L_L0 + xvldrepl.d U12, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 -.L_L71: - /* Load 16 * 64 from A0 */ + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x4x4_END xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 + xvfmadd.d D0, U8, U12, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 - /* Add stride for A0, B0 */ - addi.d A0, A0, 0x80 + addi.d A0, A0, 0x20 addi.d B0, B0, 0x20 - addi.d TL, TL, -1 - blt ZERO,TL, .L_L71 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D12, U0, U7, D12 +.endm -.L_L0: -#if defined(TRMMKERNEL) - xvfmul.d D0, D0, VALPHA - xvfmul.d D1, D1, VALPHA - xvfmul.d D2, D2, VALPHA - xvfmul.d D3, D3, VALPHA - xvfmul.d D4, D4, VALPHA - xvfmul.d D5, D5, VALPHA - xvfmul.d D6, D6, VALPHA - xvfmul.d D7, D7, VALPHA - xvfmul.d D8, D8, VALPHA - xvfmul.d D9, D9, VALPHA - xvfmul.d D10, D10, VALPHA - xvfmul.d D11, D11, VALPHA - xvfmul.d D12, D12, VALPHA - xvfmul.d D13, D13, VALPHA - xvfmul.d D14, D14, VALPHA - xvfmul.d D15, D15, VALPHA -#else - /* Load C0 */ - xvld U0, C0, 0x00 - xvld U1, C0, 0x20 - xvld U2, C0, 0x40 - xvld U3, C0, 0x60 - xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ - xvfmadd.d D1, D1, VALPHA, U1 - xvfmadd.d D2, D2, VALPHA, U2 - xvfmadd.d D3, D3, VALPHA, U3 +.macro KERNEL8x4x4 +.rept 4 + KERNEL2x4x4 +.endr +.endm - /* Load C1 */ - xvld U0, C1, 0x00 - xvld U1, C1, 0x20 - xvld U2, C1, 0x40 - xvld U3, C1, 0x60 - xvfmadd.d D4, D4, VALPHA, U0 - xvfmadd.d D5, D5, VALPHA, U1 - xvfmadd.d D6, D6, VALPHA, U2 - xvfmadd.d D7, D7, VALPHA, U3 +.macro KERNEL8x4x4_END +.rept 3 + KERNEL2x4x4 +.endr + KERNEL2x4x4_END +.endm - /* Load C2 */ - xvld U0, C2, 0x00 - xvld U1, C2, 0x20 - xvld U2, C2, 0x40 - xvld U3, C2, 0x60 - xvfmadd.d D8, D8, VALPHA, U0 - xvfmadd.d D9, D9, VALPHA, U1 - xvfmadd.d D10, D10, VALPHA, U2 - xvfmadd.d D11, D11, VALPHA, U3 +.macro KERNEL2x2x4 + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 - /* Load C3 */ - xvld U0, C3, 0x00 - xvld U1, C3, 0x20 - xvld U2, C3, 0x40 - xvld U3, C3, 0x60 - xvfmadd.d D12, D12, VALPHA, U0 - xvfmadd.d D13, D13, VALPHA, U1 - xvfmadd.d D14, D14, VALPHA, U2 - xvfmadd.d D15, D15, VALPHA, U3 -#endif // #if defined(TRMMKERNEL) + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 - /* Store C0 */ - xvst D0, C0, 0x00 - xvst D1, C0, 0x20 - xvst D2, C0, 0x40 - xvst D3, C0, 0x60 - /* Store C1 */ - xvst D4, C1, 0x00 - xvst D5, C1, 0x20 - xvst D6, C1, 0x40 - xvst D7, C1, 0x60 - /* Store C2 */ - xvst D8, C2, 0x00 - xvst D9, C2, 0x20 - xvst D10, C2, 0x40 - xvst D11, C2, 0x60 - /* Store C3 */ - xvst D12, C3, 0x00 - xvst D13, C3, 0x20 - xvst D14, C3, 0x40 - xvst D15, C3, 0x60 + xvld U4, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 - /* Add stride for C */ - addi.d C0, C0, 0x80 - addi.d C1, C1, 0x80 - addi.d C2, C2, 0x80 - addi.d C3, C3, 0x80 + xvldrepl.d U8, A0, 0x00 + xvldrepl.d U9, A0, 0x08 -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - sub.d L, K, OFF -#ifdef LEFT - /* number of values in A */ - addi.d L, L, -16 -#else - /* number of values in B */ - addi.d L, L, -4 -#endif - slli.d T0, L, 0x07 - add.d A0, A0, T0 - slli.d T0, L, 0x05 - add.d B0, B0, T0 -#endif + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 -#ifdef LEFT - addi.d OFF, OFF, 0x10 -#endif -#endif // #if defined(TRMMKERNEL) + xvld U12, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 +.endm - addi.d I, I, -1 /* I-- */ - blt ZERO,I, .L_I1 +.macro KERNEL2x2x4_END + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 -.L_M8: - /* We have done M & 16, considering M=8/4/2/1 */ - andi I, M, 15 - beq ZERO,I, .L_M0 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 - andi I, M, 8 - beq ZERO,I, .L_M4 + xvld U4, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B -#else - slli.d T0, OFF, 0x06 - add.d A0, A0, T0 - slli.d T0, OFF, 0x05 - add.d B0, B, T0 -#endif + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 +.endm -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - sub.d L, K, OFF -#elif defined(LEFT) - /* number of values in A */ - addi.d L, OFF, 8 -#else - /* number of values in B */ - addi.d L, OFF, 4 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif // #if defined(TRMMKERNEL) +.macro KERNEL8x2x4 +.rept 4 + KERNEL2x2x4 +.endr +.endm - /* Load 8 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 +.macro KERNEL8x2x4_END +.rept 3 + KERNEL2x2x4 +.endr + KERNEL2x2x4_END +.endm - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - xvfmul.d D1, U1, U4 +.macro KERNEL2x1x4 + xvldrepl.d U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvld U4, B0, 0x00 - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 - xvfmul.d D5, U1, U4 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 - xvldrepl.d U4, B0, 0x10 - /* line 3 */ - xvfmul.d D8, U0, U4 - xvfmul.d D9, U1, U4 + xvldrepl.d U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvld U12, B0, 0x00 - xvldrepl.d U4, B0, 0x18 - /* line 4 */ - xvfmul.d D12, U0, U4 - xvfmul.d D13, U1, U4 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 +.endm - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_M8_L7 */ - beq ZERO,TL, .L_M8_L7 +.macro KERNEL2x1x4_END + xvldrepl.d U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvld U4, B0, 0x00 -.L_M8_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x4 +.rept 4 + KERNEL2x1x4 +.endr +.endm + +.macro KERNEL8x1x4_END +.rept 3 + KERNEL2x1x4 +.endr + KERNEL2x1x4_END +.endm + +.macro KERNEL2x16x2 xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvld U10, A0, 0x40 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 + xvld U11, A0, 0x60 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 +.endm - /***8-2***/ +.macro KERNEL2x16x2_END xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 +.endm - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 +.macro KERNEL8x16x2 +.rept 4 + KERNEL2x16x2 +.endr +.endm + +.macro KERNEL8x16x2_END +.rept 3 + KERNEL2x16x2 +.endr + KERNEL2x16x2_END +.endm - /***8-3***/ +.macro KERNEL2x8x2 xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvld U9, A0, 0x20 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + addi.d B0, B0, 0x10 +.endm - /***8-4***/ +.macro KERNEL2x8x2_END xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 +.endm - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 +.macro KERNEL8x8x2 +.rept 4 + KERNEL2x8x2 +.endr +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 +.macro KERNEL8x8x2_END +.rept 3 + KERNEL2x8x2 + .endr + KERNEL2x8x2_END +.endm - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 - - /***8-5***/ +.macro KERNEL2x4x2 xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvldrepl.d U5, B0, 0x08 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 +.endm - /***8-6***/ +.macro KERNEL2x4x2_END xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvldrepl.d U5, B0, 0x08 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 +.macro KERNEL8x4x2 +.rept 4 + KERNEL2x4x2 +.endr +.endm - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 +.macro KERNEL8x4x2_END +.rept 3 + KERNEL2x4x2 +.endr + KERNEL2x4x2_END +.endm - /***8-7***/ +.macro KERNEL2x2x2 xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D4, U0, U5, D4 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 +.macro KERNEL2x2x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm - /***8-8***/ +.macro KERNEL8x2x2 +.rept 4 + KERNEL2x2x2 +.endr +.endm + +.macro KERNEL8x2x2_END +.rept 3 + KERNEL2x2x2 +.endr + KERNEL2x2x2_END +.endm + +.macro KERNEL2x1x2 xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D4, U0, U5, D4 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 +.macro KERNEL2x1x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 - addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_M8_TL1 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 -.L_M8_L7: - /* if (!(L & 7)) goto L_M8_L0 */ - andi TL, L, 7 - beq TL, ZERO,.L_M8_L0 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm -.L_M8_L71: +.macro KERNEL8x1x2 +.rept 4 + KERNEL2x1x2 +.endr +.endm + +.macro KERNEL8x1x2_END +.rept 3 + KERNEL2x1x2 +.endr + KERNEL2x1x2_END +.endm + +.macro KERNEL2x16x1 xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 + xvldrepl.d U12, B0, 0x00 - /* Add stride for A0, B0 */ - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 +.endm - addi.d TL, TL, -1 - blt ZERO,TL, .L_M8_L71 +.macro KERNEL2x16x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 -.L_M8_L0: -#if defined(TRMMKERNEL) - xvfmul.d D0, D0, VALPHA - xvfmul.d D1, D1, VALPHA - xvfmul.d D4, D4, VALPHA - xvfmul.d D5, D5, VALPHA - xvfmul.d D8, D8, VALPHA - xvfmul.d D9, D9, VALPHA - xvfmul.d D12, D12, VALPHA - xvfmul.d D13, D13, VALPHA -#else - /* Load C0 */ - xvld U0, C0, 0x00 - xvld U1, C0, 0x20 - xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ - xvfmadd.d D1, D1, VALPHA, U1 + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 - /* Load C1 */ - xvld U0, C1, 0x00 - xvld U1, C1, 0x20 - xvfmadd.d D4, D4, VALPHA, U0 - xvfmadd.d D5, D5, VALPHA, U1 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 - /* Load C2 */ - xvld U0, C2, 0x00 - xvld U1, C2, 0x20 - xvfmadd.d D8, D8, VALPHA, U0 - xvfmadd.d D9, D9, VALPHA, U1 + xvldrepl.d U4, B0, 0x00 - /* Load C3 */ - xvld U0, C3, 0x00 - xvld U1, C3, 0x20 - xvfmadd.d D12, D12, VALPHA, U0 - xvfmadd.d D13, D13, VALPHA, U1 -#endif // #if defined(TRMMKERNEL) + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 - /* Store C0 */ - xvst D0, C0, 0x00 - xvst D1, C0, 0x20 - /* Store C1 */ - xvst D4, C1, 0x00 - xvst D5, C1, 0x20 - /* Store C2 */ - xvst D8, C2, 0x00 - xvst D9, C2, 0x20 - /* Store C3 */ - xvst D12, C3, 0x00 - xvst D13, C3, 0x20 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 - /* Add stride for C */ - addi.d C0, C0, 0x40 - addi.d C1, C1, 0x40 - addi.d C2, C2, 0x40 - addi.d C3, C3, 0x40 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 +.endm -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - sub.d L, K, OFF -#ifdef LEFT - /* number of values in A */ - addi.d L, L, -8 -#else - /* number of values in B */ - addi.d L, L, -4 -#endif - slli.d T0, L, 0x06 - add.d A0, A0, T0 - slli.d T0, L, 0x05 - add.d B0, B0, T0 -#endif +.macro KERNEL8x16x1 +.rept 4 + KERNEL2x16x1 +.endr +.endm -#ifdef LEFT - /* number of values in A */ - addi.d OFF, OFF, 0x08 -#endif -#endif // #if defined(TRMMKERNEL) +.macro KERNEL8x16x1_END +.rept 3 + KERNEL2x16x1 +.endr + KERNEL2x16x1_END +.endm -/********LOOP (if(N >> 2 ) && (M & 8)) End************/ +.macro KERNEL2x8x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvldrepl.d U4, B0, 0x00 -.L_M4: - andi I, M, 4 - beq ZERO,I, .L_M2 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B -#else - slli.d T0, OFF, 0x05 - add.d A0, A0, T0 - add.d B0, B, T0 -#endif + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvld U9, A0, 0x20 + xvldrepl.d U12, B0, 0x00 -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - sub.d L, K, OFF -#elif defined(LEFT) - /* number of values in A */ - addi.d L, OFF, 4 -#else - /* number of values in B */ - addi.d L, OFF, 4 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 +.endm - /* Load 4 * 64 from A0 */ +.macro KERNEL2x8x1_END xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvldrepl.d U4, B0, 0x00 - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x10 - /* line 3 */ - xvfmul.d D8, U0, U4 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 +.endm - xvldrepl.d U4, B0, 0x18 - /* line 4 */ - xvfmul.d D12, U0, U4 +.macro KERNEL8x8x1 +.rept 4 + KERNEL2x8x1 +.endr +.endm - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_M4_L7 */ - beq ZERO,TL, .L_M4_L7 +.macro KERNEL8x8x1_END +.rept 3 + KERNEL2x8x1 +.endr + KERNEL2x8x1_END +.endm -.L_M4_TL1: /* TL-- */ - /***8-1***/ +.macro KERNEL2x4x1 xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 + addi.d B0, B0, 0x08 +.endm - /***8-2***/ +.macro KERNEL2x4x1_END xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D0, U0, U4, D0 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +.macro KERNEL8x4x1 +.rept 4 + KERNEL2x4x1 +.endr +.endm - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 +.macro KERNEL8x4x1_END +.rept 3 + KERNEL2x4x1 +.endr + KERNEL2x4x1_END +.endm - /***8-3***/ +.macro KERNEL2x2x1 xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 +.endm - /***8-4***/ +.macro KERNEL2x2x1_END xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D0, U0, U4, D0 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +.macro KERNEL8x2x1 +.rept 4 + KERNEL2x2x1 +.endr +.endm - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 +.macro KERNEL8x2x1_END +.rept 3 + KERNEL2x2x1 +.endr + KERNEL2x2x1_END +.endm - /***8-5***/ +.macro KERNEL2x1x1 xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 +.endm - /***8-6***/ +.macro KERNEL2x1x1_END xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + xvfmadd.d D0, U0, U4, D0 +.endm - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 +.macro KERNEL8x1x1 +.rept 4 + KERNEL2x1x1 +.endr +.endm - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 +.macro KERNEL8x1x1_END +.rept 3 + KERNEL2x1x1 +.endr + KERNEL2x1x1_END +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 + PROLOGUE - /***8-7***/ - xvld U0, A0, 0x00 + addi.d $sp, $sp, -120 + /* Store regs */ + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA, $sp, 112 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, ZERO, OFFSET +#else + xor OFF, OFF, OFF +#endif - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + /* if (!(N >> 2)) goto L_N3 */ + srai.d J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + xvldrepl.d VALPHA, $sp, 112 /* When N < 4, VALPHA will not changed */ + beq ZERO, J, .L_N3 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 +.L_J1: /* J-- && This loop include Condition 1 */ - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! ************************* +* dgemm_core_16x4 */ + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + addi.d J, J, -1 /* J-- */ + add.d C2, C1, T0 + add.d C3, C2, T0 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_M8 - /***8-8***/ +.L_I1: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + /* Calculate the first set of D0~D15, + * avoidig set 0 operation + * Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U4, B0, 0x00 + preld 0, C0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + preld 0, C0, 0x40 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + preld 0, C1, 0x00 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + preld 0, C1, 0x40 + xvfmul.d D6, U2, U5 + xvfmul.d D7, U3, U5 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvldrepl.d U6, B0, 0x10 + preld 0, C2, 0x00 + /* line 3 */ + xvfmul.d D8, U0, U6 + xvfmul.d D9, U1, U6 + preld 0, C2, 0x40 + xvfmul.d D10, U2, U6 + xvfmul.d D11, U3, U6 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvldrepl.d U7, B0, 0x18 + preld 0, C3, 0x00 + /* line 4 */ + xvfmul.d D12, U0, U7 + xvfmul.d D13, U1, U7 + preld 0, C3, 0x40 + xvfmul.d D14, U2, U7 + xvfmul.d D15, U3, U7 - addi.d A0, A0, 0x20 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x80 addi.d B0, B0, 0x20 + beq ZERO, TL, .L_TL1_END +.L_TL1: /* TL-- */ + KERNEL8x16x4 addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_M4_TL1 + blt ZERO,TL, .L_TL1 -.L_M4_L7: - /* if (!(L & 7)) goto L_M4_L0 */ +.L_TL1_END: + KERNEL8x16x4_END + + /* Maybe we need calculate the last + * 7 sets of D0~D15? + */ +.L_L7: + /* if (!(L & 7)) goto L_L0 */ andi TL, L, 7 - beq TL, ZERO,.L_M4_L0 + beq TL, ZERO,.L_L0 -.L_M4_L71: +.L_L71: + /* Load 16 * 64 from A0 */ xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 /* Add stride for A0, B0 */ - addi.d A0, A0, 0x20 + addi.d A0, A0, 0x80 addi.d B0, B0, 0x20 addi.d TL, TL, -1 - blt ZERO,TL, .L_M4_L71 + blt ZERO,TL, .L_L71 -.L_M4_L0: +.L_L0: + xvldrepl.d VALPHA, $sp, 112 #if defined(TRMMKERNEL) xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D6, D6, VALPHA + xvfmul.d D7, D7, VALPHA xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D10, D10, VALPHA + xvfmul.d D11, D11, VALPHA xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA + xvfmul.d D14, D14, VALPHA + xvfmul.d D15, D15, VALPHA #else /* Load C0 */ xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U4 + xvfmadd.d D5, D5, VALPHA, U5 + xvfmadd.d D6, D6, VALPHA, U6 + xvfmadd.d D7, D7, VALPHA, U7 /* Load C2 */ - xvld U0, C2, 0x00 - xvfmadd.d D8, D8, VALPHA, U0 + xvld U8, C2, 0x00 + xvld U9, C2, 0x20 + xvld U10, C2, 0x40 + xvld U11, C2, 0x60 + xvfmadd.d D8, D8, VALPHA, U8 + xvfmadd.d D9, D9, VALPHA, U9 + xvfmadd.d D10, D10, VALPHA, U10 + xvfmadd.d D11, D11, VALPHA, U11 /* Load C3 */ xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvld U2, C3, 0x40 + xvld U3, C3, 0x60 xvfmadd.d D12, D12, VALPHA, U0 -#endif // #if defined(TRMMKERNEL) + xvfmadd.d D13, D13, VALPHA, U1 + xvfmadd.d D14, D14, VALPHA, U2 + xvfmadd.d D15, D15, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) /* Store C0 */ xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 /* Store C1 */ xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + xvst D6, C1, 0x40 + xvst D7, C1, 0x60 /* Store C2 */ xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + xvst D10, C2, 0x40 + xvst D11, C2, 0x60 /* Store C3 */ xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + xvst D14, C3, 0x40 + xvst D15, C3, 0x60 /* Add stride for C */ - addi.d C0, C0, 0x20 - addi.d C1, C1, 0x20 - addi.d C2, C2, 0x20 - addi.d C3, C3, 0x20 + addi.d C0, C0, 0x80 + addi.d C1, C1, 0x80 + addi.d C2, C2, 0x80 + addi.d C3, C3, 0x80 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub.d L, K, OFF #ifdef LEFT - /* number of values in A */ - addi.d L, L, -4 + /* number of values in A */ + addi.d L, L, -16 #else /* number of values in B */ addi.d L, L, -4 #endif - slli.d T0, L, 0x05 + slli.d T0, L, 0x07 add.d A0, A0, T0 + slli.d T0, L, 0x05 add.d B0, B0, T0 #endif #ifdef LEFT - /* number of values in A */ - addi.d OFF, OFF, 0x04 + addi.d OFF, OFF, 0x10 #endif #endif // #if defined(TRMMKERNEL) -/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_I1 -.L_M2: - andi I, M, 2 - beq ZERO,I, .L_M1 +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_M0 + + andi I, M, 8 + beq ZERO,I, .L_M4 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B0, B #else - slli.d T0, OFF, 0x04 + slli.d T0, OFF, 0x06 add.d A0, A0, T0 slli.d T0, OFF, 0x05 add.d B0, B, T0 @@ -1361,7 +1427,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub.d L, K, OFF #elif defined(LEFT) /* number of values in A */ - addi.d L, OFF, 2 + addi.d L, OFF, 8 #else /* number of values in B */ addi.d L, OFF, 4 @@ -1369,262 +1435,163 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else // #if !defined(TRMMKERNEL) move B0, B move L, K /* L = bk */ -#endif +#endif // #if defined(TRMMKERNEL) - /* Load 2 * 64 from A0 */ + /* Load 8 * 64 from A0 */ xvld U0, A0, 0x00 + xvld U1, A0, 0x20 xvldrepl.d U4, B0, 0x00 /* line 1 */ xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 - xvldrepl.d U4, B0, 0x08 + xvldrepl.d U5, B0, 0x08 /* line 2 */ - xvfmul.d D4, U0, U4 + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 - xvldrepl.d U4, B0, 0x10 + xvldrepl.d U6, B0, 0x10 /* line 3 */ - xvfmul.d D8, U0, U4 + xvfmul.d D8, U0, U6 + xvfmul.d D9, U1, U6 - xvldrepl.d U4, B0, 0x18 + xvldrepl.d U7, B0, 0x18 /* line 4 */ - xvfmul.d D12, U0, U4 + xvfmul.d D12, U0, U7 + xvfmul.d D13, U1, U7 /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x10 + addi.d A0, A0, 0x40 addi.d B0, B0, 0x20 /* Reduce L */ addi.d L, L, -1 srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_M2_L7 */ - beq ZERO,TL, .L_M2_L7 - -.L_M2_TL1: /* TL-- */ - /***8-1***/ - /* Load 2 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-7***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + /* if (TL < 1) goto L_M8_L7 */ + beq ZERO,TL, .L_M8_L7 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + addi.d TL, TL, -1 - addi.d A0, A0, 0x10 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x40 addi.d B0, B0, 0x20 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + beq ZERO, TL, .L_M8_TL1_END - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 +.L_M8_TL1: /* TL-- */ + KERNEL8x8x4 addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_M2_TL1 + blt ZERO,TL, .L_M8_TL1 -.L_M2_L7: - /* if (!(L & 7)) goto L_M2_L0 */ +.L_M8_TL1_END: + KERNEL8x8x4_END + +.L_M8_L7: + /* if (!(L & 7)) goto L_M8_L0 */ andi TL, L, 7 - beq TL, ZERO,.L_M2_L0 + beq TL, ZERO,.L_M8_L0 -.L_M2_L71: +.L_M8_L71: xvld U0, A0, 0x00 + xvld U1, A0, 0x20 xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 /* Add stride for A0, B0 */ - addi.d A0, A0, 0x10 + addi.d A0, A0, 0x40 addi.d B0, B0, 0x20 addi.d TL, TL, -1 - blt ZERO,TL, .L_M2_L71 + blt ZERO,TL, .L_M8_L71 -.L_M2_L0: +.L_M8_L0: + xvldrepl.d VALPHA, $sp, 112 #if defined(TRMMKERNEL) xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA #else /* Load C0 */ xvld U0, C0, 0x00 + xvld U1, C0, 0x20 xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U2 + xvfmadd.d D5, D5, VALPHA, U3 /* Load C2 */ - xvld U0, C2, 0x00 - xvfmadd.d D8, D8, VALPHA, U0 + xvld U4, C2, 0x00 + xvld U5, C2, 0x20 + xvfmadd.d D8, D8, VALPHA, U4 + xvfmadd.d D9, D9, VALPHA, U5 /* Load C3 */ - xvld U0, C3, 0x00 - xvfmadd.d D12, D12, VALPHA, U0 + xvld U6, C3, 0x00 + xvld U7, C3, 0x20 + xvfmadd.d D12, D12, VALPHA, U6 + xvfmadd.d D13, D13, VALPHA, U7 #endif // #if defined(TRMMKERNEL) - xvstelm.d D0, C0, 0x00, 0x00 - xvstelm.d D4, C1, 0x00, 0x00 - xvstelm.d D8, C2, 0x00, 0x00 - xvstelm.d D12, C3, 0x00, 0x00 - xvstelm.d D0, C0, 0x08, 0x01 - xvstelm.d D4, C1, 0x08, 0x01 - xvstelm.d D8, C2, 0x08, 0x01 - xvstelm.d D12, C3, 0x08, 0x01 + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + /* Store C2 */ + xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 /* Add stride for C */ - addi.d C0, C0, 0x10 - addi.d C1, C1, 0x10 - addi.d C2, C2, 0x10 - addi.d C3, C3, 0x10 + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + addi.d C2, C2, 0x40 + addi.d C3, C3, 0x40 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub.d L, K, OFF #ifdef LEFT /* number of values in A */ - addi.d L, L, -2 + addi.d L, L, -8 #else /* number of values in B */ addi.d L, L, -4 #endif - slli.d T0, L, 0x04 + slli.d T0, L, 0x06 add.d A0, A0, T0 slli.d T0, L, 0x05 add.d B0, B0, T0 @@ -1632,23 +1599,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef LEFT /* number of values in A */ - addi.d OFF, OFF, 0x02 + addi.d OFF, OFF, 0x08 #endif #endif // #if defined(TRMMKERNEL) -/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ +/********LOOP (if(N >> 2 ) && (M & 8)) End************/ -.L_M1: - andi I, M, 1 - beq ZERO,I, .L_M0 +.L_M4: + andi I, M, 4 + beq ZERO,I, .L_M2 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B0, B #else - slli.d T0, OFF, 0x03 - add.d A0, A0, T0 slli.d T0, OFF, 0x05 + add.d A0, A0, T0 add.d B0, B, T0 #endif @@ -1656,7 +1622,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub.d L, K, OFF #elif defined(LEFT) /* number of values in A */ - addi.d L, OFF, 1 + addi.d L, OFF, 4 #else /* number of values in B */ addi.d L, OFF, 4 @@ -1666,55 +1632,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. move L, K /* L = bk */ #endif - /* Load 1 * 64 from A0 */ + /* Load 4 * 64 from A0 */ xvld U0, A0, 0x00 xvldrepl.d U4, B0, 0x00 /* line 1 */ xvfmul.d D0, U0, U4 - xvldrepl.d U4, B0, 0x08 + xvldrepl.d U5, B0, 0x08 /* line 2 */ - xvfmul.d D4, U0, U4 + xvfmul.d D4, U0, U5 - xvldrepl.d U4, B0, 0x10 + xvldrepl.d U6, B0, 0x10 /* line 3 */ - xvfmul.d D8, U0, U4 + xvfmul.d D8, U0, U6 - xvldrepl.d U4, B0, 0x18 + xvldrepl.d U7, B0, 0x18 /* line 4 */ - xvfmul.d D12, U0, U4 + xvfmul.d D12, U0, U7 /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x08 + addi.d A0, A0, 0x20 addi.d B0, B0, 0x20 /* Reduce L */ addi.d L, L, -1 srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_M1_L7 */ - beq ZERO,TL, .L_M1_L7 + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 -.L_M1_TL1: /* TL-- */ - /***8-1***/ - /* Load 1 * 64 from A0 */ - xvld U0, A0, 0x00 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + addi.d TL, TL, -1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + beq ZERO, TL, .L_M4_TL1_END - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +.L_M4_TL1: /* TL-- */ + KERNEL8x4x4 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_TL1 + +.L_M4_TL1_END: + KERNEL8x4x4_END + +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 - /***8-2***/ +.L_M4_L71: xvld U0, A0, 0x00 xvldrepl.d U4, B0, 0x00 @@ -1729,119 +1702,287 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvldrepl.d U4, B0, 0x18 xvfmadd.d D12, U0, U4, D12 - addi.d A0, A0, 0x08 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 addi.d B0, B0, 0x20 - /***8-3***/ - xvld U0, A0, 0x00 + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_L71 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +.L_M4_L0: + xvldrepl.d VALPHA, $sp, 112 +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + /* Load C1 */ + xvld U1, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U1 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + /* Load C2 */ + xvld U2, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U2 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + /* Load C3 */ + xvld U3, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 + /* Store C0 */ + xvst D0, C0, 0x00 + /* Store C1 */ + xvst D4, C1, 0x00 + /* Store C2 */ + xvst D8, C2, 0x00 + /* Store C3 */ + xvst D12, C3, 0x00 - /***8-4***/ - xvld U0, A0, 0x00 + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + addi.d C2, C2, 0x20 + addi.d C3, C3, 0x20 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -4 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 +/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 - addi.d A0, A0, 0x08 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 + + xvld U4, B0, 0x00 + + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 + + xvldrepl.d U8, A0, 0x00 + xvldrepl.d U9, A0, 0x08 + + addi.d TL, TL, -1 + + xvld U12, B0, 0x00 + addi.d A0, A0, 0x10 addi.d B0, B0, 0x20 - /***8-5***/ - xvld U0, A0, 0x00 + beq ZERO, TL, .L_M2_TL1_END +.L_M2_TL1: /* TL-- */ + KERNEL8x2x4 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 +.L_M2_TL1_END: + KERNEL8x2x4_END - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 +.L_M2_L71: + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvld U4, B0, 0x00 - addi.d A0, A0, 0x08 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 addi.d B0, B0, 0x20 - /***8-6***/ - xvld U0, A0, 0x00 + addi.d TL, TL, -1 + blt ZERO,TL, .L_M2_L71 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +.L_M2_L0: + xvldrepl.d VALPHA, $sp, 112 +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D0, C1, 0x00, 0x01 + xvstelm.d D0, C2, 0x00, 0x02 + xvstelm.d D0, C3, 0x00, 0x03 + xvstelm.d D1, C0, 0x08, 0x00 + xvstelm.d D1, C1, 0x08, 0x01 + xvstelm.d D1, C2, 0x08, 0x02 + xvstelm.d D1, C3, 0x08, 0x03 +#else + xvpackev.d D4, D1, D0 + xvpackod.d D5, D1, D0 + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvpermi.q U2, U0, 0x20 + xvpermi.q U3, U1, 0x20 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D0, D4, VALPHA, U2 + xvfmadd.d D1, D5, VALPHA, U3 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 + vst $vr16, C0, 0x00 + vst $vr17, C1, 0x00 + xvstelm.d D0, C2, 0x00, 0x02 + xvstelm.d D1, C3, 0x00, 0x02 + xvstelm.d D0, C2, 0x08, 0x03 + xvstelm.d D1, C3, 0x08, 0x03 +#endif // #if defined(TRMMKERNEL) - /***8-7***/ - xvld U0, A0, 0x00 + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + addi.d C2, C2, 0x10 + addi.d C3, C3, 0x10 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -2 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 +/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif - /***8-8***/ - xvld U0, A0, 0x00 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U0, A0, 0x00 + xvld U4, B0, 0x00 + xvfmul.d D0, U0, U4 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvldrepl.d U8, A0, 0x00 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + addi.d TL, TL, -1 + xvld U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 + beq ZERO, TL, .L_M1_TL1_END + +.L_M1_TL1: /* TL-- */ + KERNEL8x1x4 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_M1_TL1 +.L_M1_TL1_END: + KERNEL8x1x4_END .L_M1_L7: /* if (!(L & 7)) goto L_M1_L0 */ @@ -1849,19 +1990,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. beq TL, ZERO,.L_M1_L0 .L_M1_L71: - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvldrepl.d U0, A0, 0x00 + xvld U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 /* Add stride for A0, B0 */ addi.d A0, A0, 0x08 @@ -1871,33 +2002,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. blt ZERO,TL, .L_M1_L71 .L_M1_L0: + xvldrepl.d VALPHA, $sp, 112 #if defined(TRMMKERNEL) xvfmul.d D0, D0, VALPHA - xvfmul.d D4, D4, VALPHA - xvfmul.d D8, D8, VALPHA - xvfmul.d D12, D12, VALPHA + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D0, C1, 0x00, 0x01 + xvstelm.d D0, C2, 0x00, 0x02 + xvstelm.d D0, C3, 0x00, 0x03 #else /* Load C0 */ - xvld U0, C0, 0x00 - xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvldrepl.d U0, C0, 0x00 + xvfmadd.d D4, D0, VALPHA, U0 /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvldrepl.d U1, C1, 0x00 + xvfmadd.d D5, D0, VALPHA, U1 /* Load C2 */ - xvld U0, C2, 0x00 - xvfmadd.d D8, D8, VALPHA, U0 + xvldrepl.d U2, C2, 0x00 + xvfmadd.d D6, D0, VALPHA, U2 /* Load C3 */ - xvld U0, C3, 0x00 - xvfmadd.d D12, D12, VALPHA, U0 -#endif // #if defined(TRMMKERNEL) + xvldrepl.d U3, C3, 0x00 + xvfmadd.d D7, D0, VALPHA, U3 - xvstelm.d D0, C0, 0x00, 0x00 - xvstelm.d D4, C1, 0x00, 0x00 - xvstelm.d D8, C2, 0x00, 0x00 - xvstelm.d D12, C3, 0x00, 0x00 + xvstelm.d D4, C0, 0x00, 0x00 + xvstelm.d D5, C1, 0x00, 0x01 + xvstelm.d D6, C2, 0x00, 0x02 + xvstelm.d D7, C3, 0x00, 0x03 +#endif // #if defined(TRMMKERNEL) /* Add stride for C */ addi.d C0, C0, 0x08 @@ -1952,6 +2086,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ///////////////////////////////////////////////// /************************ Condition 1 if((N >> 2) && (M >> 4)) END !!! ************************/ + xvldrepl.d VALPHA, $sp, 112 + .L_N3: andi J, N, 2 beq ZERO, J, .L_N1 @@ -1993,223 +2129,65 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d L, OFF, 2 #endif #else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif - - /* Load 16 * 64 from A0 - * U0 = {a3, a2, a1, a0} - * U1 = {a7, a6, a5, a4} - * U2 = {a11, a10, a9, a8} - * U3 = {a15, a14, a13, a12} - */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - xvfmul.d D1, U1, U4 - xvfmul.d D2, U2, U4 - xvfmul.d D3, U3, U4 - - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 - xvfmul.d D5, U1, U4 - xvfmul.d D6, U2, U4 - xvfmul.d D7, U3, U4 - - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_N3_L7 */ - beq ZERO,TL, .L_N3_L7 - -.L_N3_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-2***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-3***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-4***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-5***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-6***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 + move B0, B + move L, K /* L = bk */ +#endif - /***8-7***/ - /* Load 16 * 64 from A0 */ + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ xvld U0, A0, 0x00 xvld U1, A0, 0x20 xvld U2, A0, 0x40 xvld U3, A0, 0x60 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + xvfmul.d D6, U2, U5 + xvfmul.d D7, U3, U5 - /***8-8***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_N3_L7 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x80 addi.d B0, B0, 0x10 + beq ZERO, TL, .L_N3_TL1_END + +.L_N3_TL1: /* TL-- */ + KERNEL8x16x2 + addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_TL1 +.L_N3_TL1_END: + KERNEL8x16x2_END .L_N3_L7: /* if (!(L & 7)) goto L_N3_L0 */ @@ -2229,12 +2207,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D2, U2, U4, D2 xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 /* Add stride for A0, B0 */ addi.d A0, A0, 0x80 addi.d B0, B0, 0x10 @@ -2264,14 +2241,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D3, D3, VALPHA, U3 /* Load C1 */ - xvld U0, C1, 0x00 - xvld U1, C1, 0x20 - xvld U2, C1, 0x40 - xvld U3, C1, 0x60 - xvfmadd.d D4, D4, VALPHA, U0 - xvfmadd.d D5, D5, VALPHA, U1 - xvfmadd.d D6, D6, VALPHA, U2 - xvfmadd.d D7, D7, VALPHA, U3 + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U4 + xvfmadd.d D5, D5, VALPHA, U5 + xvfmadd.d D6, D6, VALPHA, U6 + xvfmadd.d D7, D7, VALPHA, U7 #endif // #if defined(TRMMKERNEL) /* Store C0 */ @@ -2352,10 +2329,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmul.d D0, U0, U4 xvfmul.d D1, U1, U4 - xvldrepl.d U4, B0, 0x08 + xvldrepl.d U5, B0, 0x08 /* line 2 */ - xvfmul.d D4, U0, U4 - xvfmul.d D5, U1, U4 + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 /* Add stride for A0 and B0 */ addi.d A0, A0, 0x40 @@ -2366,131 +2343,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N3_M8_L7 */ beq ZERO,TL, .L_N3_M8_L7 -.L_N3_M8_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-2***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-3***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-4***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-5***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-6***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-7***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x40 addi.d B0, B0, 0x10 - /***8-8***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + beq ZERO, TL, .L_N3_M8_TL1_END - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 +.L_N3_M8_TL1: /* TL-- */ + KERNEL8x8x2 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_M8_TL1 +.L_N3_M8_TL1_END: + KERNEL8x8x2_END .L_N3_M8_L7: /* if (!(L & 7)) goto L_N3_M8_L0 */ @@ -2505,9 +2376,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 /* Add stride for A0, B0 */ addi.d A0, A0, 0x40 @@ -2530,10 +2401,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D1, D1, VALPHA, U1 /* Load C1 */ - xvld U0, C1, 0x00 - xvld U1, C1, 0x20 - xvfmadd.d D4, D4, VALPHA, U0 - xvfmadd.d D5, D5, VALPHA, U1 + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U2 + xvfmadd.d D5, D5, VALPHA, U3 #endif // #if defined(TRMMKERNEL) /* Store C0 */ @@ -2561,162 +2432,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d B0, B0, T0 #endif -#ifdef LEFT - addi.d OFF, OFF, 0x08 -#endif -#endif // #if defined(TRMMKERNEL) - -/********LOOP (if(N & 2) && (M & 8) ) End************/ - -.L_N3_M4: - andi I, M, 4 - beq ZERO,I, .L_N3_M2 - -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B -#else - slli.d T0, OFF, 0x05 - add.d A0, A0, T0 - slli.d T0, OFF, 0x04 - add.d B0, B, T0 -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - sub.d L, K, OFF -#elif defined(LEFT) - /* number of values in A */ - addi.d L, OFF, 4 -#else - /* number of values in B */ - addi.d L, OFF, 2 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif - - /* Load 4 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 - - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_N3_M4_L7 */ - beq ZERO,TL, .L_N3_M4_L7 - -.L_N3_M4_TL1: /* TL-- */ - /***8-1***/ - /* Load 8 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) - /***8-5***/ - xvld U0, A0, 0x00 +/********LOOP (if(N & 2) && (M & 8) ) End************/ - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +.L_N3_M4: + andi I, M, 4 + beq ZERO,I, .L_N3_M2 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif - /***8-6***/ + /* Load 4 * 64 from A0 */ xvld U0, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 - /***8-7***/ - xvld U0, A0, 0x00 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M4_L7 */ + beq ZERO,TL, .L_N3_M4_L7 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x20 addi.d B0, B0, 0x10 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + beq ZERO, TL, .L_N3_M4_TL1_END - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 +.L_N3_M4_TL1: /* TL-- */ + KERNEL8x4x2 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_M4_TL1 +.L_N3_M4_TL1_END: + KERNEL8x4x2_END .L_N3_M4_L7: /* if (!(L & 7)) goto L_N3_M4_L0 */ @@ -2729,8 +2517,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 /* Add stride for A0, B0 */ addi.d A0, A0, 0x20 @@ -2749,8 +2537,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U1, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U1 #endif // #if defined(TRMMKERNEL) /* Store C0 */ @@ -2830,106 +2618,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N3_M2_L7 */ beq ZERO,TL, .L_N3_M2_L7 -.L_N3_M2_TL1: /* TL-- */ - /***8-1***/ - /* Load 2 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 + xvld U8, A0, 0x00 - /***8-7***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x10 addi.d B0, B0, 0x10 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + beq ZERO, TL, .L_N3_M2_TL1_END - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 +.L_N3_M2_TL1: /* TL-- */ + KERNEL8x2x2 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_M2_TL1 +.L_N3_M2_TL1_END: + KERNEL8x2x2_END .L_N3_M2_L7: /* if (!(L & 7)) goto L_N3_M2_L0 */ @@ -2942,8 +2648,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 /* Add stride for A0, B0 */ addi.d A0, A0, 0x10 @@ -2962,8 +2668,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U1, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U1 #endif // #if defined(TRMMKERNEL) xvstelm.d D0, C0, 0x00, 0x00 @@ -3017,132 +2723,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else /* number of values in B */ addi.d L, OFF, 2 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif - - /* Load 1 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 - - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_N3_M1_L7 */ - beq ZERO,TL, .L_N3_M1_L7 - -.L_N3_M1_TL1: /* TL-- */ - /***8-1***/ - /* Load 1 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif - /***8-7***/ + /* Load 1 * 64 from A0 */ xvld U0, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 - /***8-8***/ - xvld U0, A0, 0x00 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_N3_M1_L7 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x08 addi.d B0, B0, 0x10 + beq ZERO, TL, .L_N3_M1_TL1_END + +.L_N3_M1_TL1: /* TL-- */ + KERNEL8x1x2 + addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_M1_TL1 +.L_N3_M1_TL1_END: + KERNEL8x1x2_END .L_N3_M1_L7: /* if (!(L & 7)) goto L_N3_M1_L0 */ @@ -3155,8 +2779,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 /* Add stride for A0, B0 */ addi.d A0, A0, 0x08 @@ -3175,8 +2799,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U1, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U1 #endif // #if defined(TRMMKERNEL) xvstelm.d D0, C0, 0x00, 0x00 @@ -3300,137 +2924,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N1_L7 */ beq ZERO,TL, .L_N1_L7 -.L_N1_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-2***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-3***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-4***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-5***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-6***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-7***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x80 addi.d B0, B0, 0x08 - /***8-8***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 + beq ZERO, TL, .L_N1_TL1_END +.L_N1_TL1: /* TL-- */ + KERNEL8x16x1 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_TL1 +.L_N1_TL1_END: + KERNEL8x16x1_END .L_N1_L7: /* if (!(L & 7)) goto L_N1_L0 */ @@ -3494,161 +3006,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif slli.d T0, L, 0x07 add.d A0, A0, T0 - slli.d T0, L, 0x03 - add.d B0, B0, T0 -#endif - -#ifdef LEFT - addi.d OFF, OFF, 0x10 -#endif -#endif // #if defined(TRMMKERNEL) - - addi.d I, I, -1 /* I-- */ - blt ZERO,I, .L_N1_I1 - -.L_N1_M8: - /* We have done M & 16, considering M=8/4/2/1 */ - andi I, M, 15 - beq ZERO,I, .L_N1_M0 - - andi I, M, 8 - beq ZERO,I, .L_N1_M4 - -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B -#else - slli.d T0, OFF, 0x06 - add.d A0, A0, T0 - slli.d T0, OFF, 0x03 - add.d B0, B, T0 -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - sub.d L, K, OFF -#elif defined(LEFT) - /* number of values in A */ - addi.d L, OFF, 8 -#else - /* number of values in B */ - addi.d L, OFF, 1 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif - - /* Load 8 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - xvfmul.d D1, U1, U4 - - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_N1_M8_L7 */ - beq ZERO,TL, .L_N1_M8_L7 - -.L_N1_M8_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - - /***8-2***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - - /***8-3***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - - /***8-4***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif - /***8-5***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_I1 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N1_M0 - /***8-6***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + andi I, M, 8 + beq ZERO,I, .L_N1_M4 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif - /***8-7***/ + /* Load 8 * 64 from A0 */ xvld U0, A0, 0x00 xvld U1, A0, 0x20 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M8_L7 */ + beq ZERO,TL, .L_N1_M8_L7 - /***8-8***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x40 addi.d B0, B0, 0x08 + beq ZERO, TL, .L_N1_M8_TL1_END +.L_N1_M8_TL1: /* TL-- */ + KERNEL8x8x1 + addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_M8_TL1 +.L_N1_M8_TL1_END: + KERNEL8x8x1_END + .L_N1_M8_L7: /* if (!(L & 7)) goto L_N1_M8_L0 */ andi TL, L, 7 @@ -3753,81 +3191,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N1_M4_L7 */ beq ZERO,TL, .L_N1_M4_L7 -.L_N1_M4_TL1: /* TL-- */ - /***8-1***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-7***/ - xvld U0, A0, 0x00 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x20 addi.d B0, B0, 0x08 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + beq ZERO, TL, .L_N1_M4_TL1_END - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 +.L_N1_M4_TL1: /* TL-- */ + KERNEL8x4x1 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_M4_TL1 +.L_N1_M4_TL1_END: + KERNEL8x4x1_END .L_N1_M4_L7: /* if (!(L & 7)) goto L_N1_M4_L0 */ @@ -3927,82 +3307,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N1_M2_L7 */ beq ZERO,TL, .L_N1_M2_L7 -.L_N1_M2_TL1: /* TL-- */ - /***8-1***/ - /* Load 2 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-7***/ - xvld U0, A0, 0x00 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x10 addi.d B0, B0, 0x08 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + beq ZERO, TL, .L_N1_M2_TL1_END - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 +.L_N1_M2_TL1: /* TL-- */ + KERNEL8x2x1 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_M2_TL1 +.L_N1_M2_TL1_END: + KERNEL8x2x1_END .L_N1_M2_L7: /* if (!(L & 7)) goto L_N1_M2_L0 */ @@ -4101,82 +3422,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N1_M1_L7 */ beq ZERO,TL, .L_N1_M1_L7 -.L_N1_M1_TL1: /* TL-- */ - /***8-1***/ - /* Load 1 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-7***/ - xvld U0, A0, 0x00 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x08 addi.d B0, B0, 0x08 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + beq ZERO, TL, .L_N1_M1_TL1_END - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 +.L_N1_M1_TL1: /* TL-- */ + KERNEL8x1x1 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_M1_TL1 +.L_N1_M1_TL1_END: + KERNEL8x1x1_END .L_N1_M1_L7: /* if (!(L & 7)) goto L_N1_M1_L0 */ @@ -4243,7 +3505,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDARG $r26, $sp, 24 LDARG $r27, $sp, 32 LD $f23, $sp, 40 - addi.d $sp, $sp, 56 + LD $f24, $sp, 48 + LD $f25, $sp, 56 + LD $f26, $sp, 64 + LD $f27, $sp, 72 + LD $f28, $sp, 80 + LD $f29, $sp, 88 + LD $f30, $sp, 96 + LD $f31, $sp, 104 + addi.d $sp, $sp, 120 jirl $r0, $r1, 0x0 diff --git a/kernel/loongarch64/dgemm_kernel_8x4.S b/kernel/loongarch64/dgemm_kernel_8x4.S new file mode 100644 index 000000000..405f1bd97 --- /dev/null +++ b/kernel/loongarch64/dgemm_kernel_8x4.S @@ -0,0 +1,2894 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA $f0 // param 4: alpha +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc + +#ifdef TRMMKERNEL +#define OFFSET $r11 // param 9: offset +#endif +#define OFF $r12 + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define T0 $r25 /* !! DO NOT USE $r21 and $r22 !! */ +#define T1 $r26 +#define T2 $r27 +#define ZERO $r0 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 +#define D0 $vr16 +#define D1 $vr17 +#define D2 $vr18 +#define D3 $vr19 +#define D4 $vr20 +#define D5 $vr21 +#define D6 $vr22 +#define D7 $vr23 +#define D8 $vr24 +#define D9 $vr25 +#define D10 $vr26 +#define D11 $vr27 +#define D12 $vr28 +#define D13 $vr29 +#define D14 $vr30 +#define D15 $vr31 +#define VALPHA $vr15 + +/* Prefetch interval */ +#define A_PRE 0x200 +#define B_PRE 0x100 + +.macro KERNEL2x8x4 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + preld 0, B0, B_PRE + vldrepl.d U5, B0, 0x08 + vfmadd.d D10, U10, U14, D10 + vfmadd.d D11, U11, U14, D11 + + preld 0, A0, A_PRE + vldrepl.d U6, B0, 0x10 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + preld 0, A0, A_PRE + 0x40 + vldrepl.d U7, B0, 0x18 + vfmadd.d D14, U10, U15, D14 + vfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + vld U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vld U9, A0, 0x10 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vld U10, A0, 0x20 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vld U11, A0, 0x30 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vldrepl.d U12, B0, 0x00 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + preld 0, B0, B_PRE + vldrepl.d U13, B0, 0x08 + vfmadd.d D10, U2, U6, D10 + vfmadd.d D11, U3, U6, D11 + + preld 0, A0, A_PRE + vldrepl.d U14, B0, 0x10 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + preld 0, A0, A_PRE + 0x40 + vldrepl.d U15, B0, 0x18 + vfmadd.d D14, U2, U7, D14 + vfmadd.d D15, U3, U7, D15 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x8x4_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + preld 0, B0, B_PRE + vldrepl.d U5, B0, 0x08 + vfmadd.d D10, U10, U14, D10 + vfmadd.d D11, U11, U14, D11 + + preld 0, A0, A_PRE + vldrepl.d U6, B0, 0x10 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + preld 0, A0, A_PRE + 0x40 + vldrepl.d U7, B0, 0x18 + vfmadd.d D14, U10, U15, D14 + vfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + preld 0, B0, B_PRE + vfmadd.d D10, U2, U6, D10 + vfmadd.d D11, U3, U6, D11 + + preld 0, A0, A_PRE + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + preld 0, A0, A_PRE + 0x40 + vfmadd.d D14, U2, U7, D14 + vfmadd.d D15, U3, U7, D15 +.endm + +.macro KERNEL8x8x4 +.rept 4 + KERNEL2x8x4 +.endr +.endm + +.macro KERNEL8x8x4_END +.rept 3 + KERNEL2x8x4 +.endr + KERNEL2x8x4_END +.endm + +.macro KERNEL2x4x4 + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + vldrepl.d U12, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U13, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vldrepl.d U14, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + vldrepl.d U15, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x4x4_END + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 +.endm + +.macro KERNEL8x4x4 +.rept 4 + KERNEL2x4x4 +.endr +.endm + +.macro KERNEL8x4x4_END +.rept 3 + KERNEL2x4x4 +.endr + KERNEL2x4x4_END +.endm + +.macro KERNEL2x2x4 + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vfmadd.d D2, U9, U12, D2 + vfmadd.d D3, U9, U13, D3 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + vldrepl.d U8, A0, 0x00 + vldrepl.d U9, A0, 0x08 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vfmadd.d D2, U1, U4, D2 + vfmadd.d D3, U1, U5, D3 + + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x2x4_END + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vfmadd.d D2, U9, U12, D2 + vfmadd.d D3, U9, U13, D3 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vfmadd.d D2, U1, U4, D2 + vfmadd.d D3, U1, U5, D3 +.endm + +.macro KERNEL8x2x4 +.rept 4 + KERNEL2x2x4 +.endr +.endm + +.macro KERNEL8x2x4_END +.rept 3 + KERNEL2x2x4 +.endr + KERNEL2x2x4_END +.endm + +.macro KERNEL2x1x4 + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vldrepl.d U8, A0, 0x08 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vld U12, B0, 0x20 + vld U13, B0, 0x30 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x40 +.endm + +.macro KERNEL2x1x4_END + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 +.endm + +.macro KERNEL8x1x4 +.rept 4 + KERNEL2x1x4 +.endr +.endm + +.macro KERNEL8x1x4_END +.rept 3 + KERNEL2x1x4 +.endr + KERNEL2x1x4_END +.endm + +.macro KERNEL2x8x2 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + vld U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vld U9, A0, 0x10 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vld U10, A0, 0x20 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vld U11, A0, 0x30 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x8x2_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 +.endm + +.macro KERNEL8x8x2 +.rept 4 + KERNEL2x8x2 +.endr +.endm + +.macro KERNEL8x8x2_END +.rept 3 + KERNEL2x8x2 +.endr + KERNEL2x8x2_END +.endm + +.macro KERNEL2x4x2 + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + vld U8, A0, 0x20 + vld U9, A0, 0x30 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vldrepl.d U12, B0, 0x10 + vldrepl.d U13, B0, 0x18 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x4x2_END + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 +.endm + +.macro KERNEL8x4x2 +.rept 4 + KERNEL2x4x2 +.endr +.endm + +.macro KERNEL8x4x2_END +.rept 3 + KERNEL2x4x2 +.endr + KERNEL2x4x2_END +.endm + +.macro KERNEL2x2x2 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D4, U8, U13, D4 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + vld U8, A0, 0x10 + vldrepl.d U12, B0, 0x10 + vldrepl.d U13, B0, 0x18 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D4, U0, U5, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x2x2_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D4, U8, U13, D4 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D4, U0, U5, D4 +.endm + +.macro KERNEL8x2x2 +.rept 4 + KERNEL2x2x2 +.endr +.endm + +.macro KERNEL8x2x2_END +.rept 3 + KERNEL2x2x2 +.endr + KERNEL2x2x2_END +.endm + +.macro KERNEL2x1x2 + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + + vld U4, B0, 0x00 + vldrepl.d U8, A0, 0x08 + vld U12, B0, 0x10 + vfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x1x2_END + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + + vld U4, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x2 +.rept 4 + KERNEL2x1x2 +.endr +.endm + +.macro KERNEL8x1x2_END +.rept 3 + KERNEL2x1x2 +.endr + KERNEL2x1x2_END +.endm + +.macro KERNEL2x8x1 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vldrepl.d U4, B0, 0x00 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vld U8, A0, 0x40 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vld U9, A0, 0x50 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vld U10, A0, 0x60 + vld U11, A0, 0x70 + + vldrepl.d U12, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x8x1_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 +.endm + +.macro KERNEL8x8x1 +.rept 4 + KERNEL2x8x1 +.endr +.endm + +.macro KERNEL8x8x1_END +.rept 3 + KERNEL2x8x1 +.endr + KERNEL2x8x1_END +.endm + +.macro KERNEL2x4x1 + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vldrepl.d U4, B0, 0x00 + + vld U8, A0, 0x20 + vld U9, A0, 0x30 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vldrepl.d U12, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x4x1_END + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 +.endm + +.macro KERNEL8x4x1 +.rept 4 + KERNEL2x4x1 +.endr +.endm + +.macro KERNEL8x4x1_END +.rept 3 + KERNEL2x4x1 +.endr + KERNEL2x4x1_END +.endm + +.macro KERNEL2x2x1 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + vld U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x2x1_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x2x1 +.rept 4 + KERNEL2x2x1 +.endr +.endm + +.macro KERNEL8x2x1_END +.rept 3 + KERNEL2x2x1 +.endr + KERNEL2x2x1_END +.endm + +.macro KERNEL2x1x1 + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + vldrepl.d U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x1x1_END + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x1 +.rept 4 + KERNEL2x1x1 +.endr +.endm + +.macro KERNEL8x1x1_END +.rept 3 + KERNEL2x1x1 +.endr + KERNEL2x1x1_END +.endm + + + PROLOGUE + + addi.d $sp, $sp, -112 + /* Store regs */ + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f24, $sp, 40 + ST $f25, $sp, 48 + ST $f26, $sp, 56 + ST $f27, $sp, 64 + ST $f28, $sp, 72 + ST $f29, $sp, 80 + ST $f30, $sp, 88 + ST $f31, $sp, 96 + ST ALPHA, $sp, 104 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, ZERO, OFFSET +#else + xor OFF, OFF, OFF +#endif + + /* if (!(N >> 2)) goto L_N3 */ + srai.d J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + vldrepl.d VALPHA, $sp, 104 /* When N < 4, VALPHA will not changed */ + beq ZERO, J, .L_N3 + +.L_J1: /* J-- && This loop include Condition 1 */ + +/************************* Condition 1 if((N >> 2) && (M >> 3)) START !!! ************************* +* dgemm_core_16x4 */ + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + addi.d J, J, -1 /* J-- */ + add.d C2, C1, T0 + add.d C3, C2, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_M8 + +.L_I1: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + /* Calculate the first set of D0~D15, + * avoidig set 0 operation + * Load 8 * 64 from A0 + * U0 = {a1, a0} + * U1 = {a3, a2} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + preld 0, C0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + preld 0, C0, 0x20 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + vldrepl.d U5, B0, 0x08 + preld 0, C1, 0x00 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + preld 0, C1, 0x20 + vfmul.d D6, U2, U5 + vfmul.d D7, U3, U5 + + vldrepl.d U6, B0, 0x10 + preld 0, C2, 0x00 + /* line 3 */ + vfmul.d D8, U0, U6 + vfmul.d D9, U1, U6 + preld 0, C2, 0x20 + vfmul.d D10, U2, U6 + vfmul.d D11, U3, U6 + + vldrepl.d U7, B0, 0x18 + preld 0, C3, 0x00 + /* line 4 */ + vfmul.d D12, U0, U7 + vfmul.d D13, U1, U7 + preld 0, C3, 0x20 + vfmul.d D14, U2, U7 + vfmul.d D15, U3, U7 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + vldrepl.d U14, B0, 0x10 + vldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_TL1_END +.L_TL1: /* TL-- */ + KERNEL8x8x4 + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_TL1 + +.L_TL1_END: + KERNEL8x8x4_END + + /* Maybe we need calculate the last + * 7 sets of D0~D15? + */ +.L_L7: + /* if (!(L & 7)) goto L_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_L0 + +.L_L71: + /* Load 16 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + /* Cumulative D0~D15 */ + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + vfmadd.d D10, U2, U6, D10 + vfmadd.d D11, U3, U6, D11 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + vfmadd.d D14, U2, U7, D14 + vfmadd.d D15, U3, U7, D15 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_L71 + +.L_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D6, D6, VALPHA + vfmul.d D7, D7, VALPHA + vfmul.d D8, D8, VALPHA + vfmul.d D9, D9, VALPHA + vfmul.d D10, D10, VALPHA + vfmul.d D11, D11, VALPHA + vfmul.d D12, D12, VALPHA + vfmul.d D13, D13, VALPHA + vfmul.d D14, D14, VALPHA + vfmul.d D15, D15, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + vld U4, C1, 0x00 + vld U5, C1, 0x10 + vld U6, C1, 0x20 + vld U7, C1, 0x30 + vfmadd.d D4, D4, VALPHA, U4 + vfmadd.d D5, D5, VALPHA, U5 + vfmadd.d D6, D6, VALPHA, U6 + vfmadd.d D7, D7, VALPHA, U7 + + /* Load C2 */ + vld U8, C2, 0x00 + vld U9, C2, 0x10 + vld U10, C2, 0x20 + vld U11, C2, 0x30 + vfmadd.d D8, D8, VALPHA, U8 + vfmadd.d D9, D9, VALPHA, U9 + vfmadd.d D10, D10, VALPHA, U10 + vfmadd.d D11, D11, VALPHA, U11 + + /* Load C3 */ + vld U0, C3, 0x00 + vld U1, C3, 0x10 + vld U2, C3, 0x20 + vld U3, C3, 0x30 + vfmadd.d D12, D12, VALPHA, U0 + vfmadd.d D13, D13, VALPHA, U1 + vfmadd.d D14, D14, VALPHA, U2 + vfmadd.d D15, D15, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + vst D6, C1, 0x20 + vst D7, C1, 0x30 + /* Store C2 */ + vst D8, C2, 0x00 + vst D9, C2, 0x10 + vst D10, C2, 0x20 + vst D11, C2, 0x30 + /* Store C3 */ + vst D12, C3, 0x00 + vst D13, C3, 0x10 + vst D14, C3, 0x20 + vst D15, C3, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + addi.d C2, C2, 0x40 + addi.d C3, C3, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -8 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_I1 + +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 7 + beq ZERO,I, .L_M0 + + andi I, M, 4 + beq ZERO,I, .L_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + + vldrepl.d U6, B0, 0x10 + /* line 3 */ + vfmul.d D8, U0, U6 + vfmul.d D9, U1, U6 + + vldrepl.d U7, B0, 0x18 + /* line 4 */ + vfmul.d D12, U0, U7 + vfmul.d D13, U1, U7 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + vldrepl.d U14, B0, 0x10 + vldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M4_TL1_END + +.L_M4_TL1: /* TL-- */ + KERNEL8x4x4 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_TL1 + +.L_M4_TL1_END: + KERNEL8x4x4_END + +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 + +.L_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_L71 + +.L_M4_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D8, D8, VALPHA + vfmul.d D9, D9, VALPHA + vfmul.d D12, D12, VALPHA + vfmul.d D13, D13, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + vld U2, C1, 0x00 + vld U3, C1, 0x10 + vfmadd.d D4, D4, VALPHA, U2 + vfmadd.d D5, D5, VALPHA, U3 + + /* Load C2 */ + vld U4, C2, 0x00 + vld U5, C2, 0x10 + vfmadd.d D8, D8, VALPHA, U4 + vfmadd.d D9, D9, VALPHA, U5 + + /* Load C3 */ + vld U6, C3, 0x00 + vld U7, C3, 0x10 + vfmadd.d D12, D12, VALPHA, U6 + vfmadd.d D13, D13, VALPHA, U7 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + /* Store C2 */ + vst D8, C2, 0x00 + vst D9, C2, 0x10 + /* Store C3 */ + vst D12, C3, 0x00 + vst D13, C3, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + addi.d C2, C2, 0x20 + addi.d C3, C3, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -4 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ + +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vfmul.d D0, U0, U4 + vfmul.d D1, U0, U5 + vfmul.d D2, U1, U4 + vfmul.d D3, U1, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 + + vldrepl.d U8, A0, 0x00 + vldrepl.d U9, A0, 0x08 + + addi.d TL, TL, -1 + + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M2_TL1_END +.L_M2_TL1: /* TL-- */ + KERNEL8x2x4 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 +.L_M2_TL1_END: + KERNEL8x2x4_END + +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 + +.L_M2_L71: + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vfmadd.d D2, U1, U4, D2 + vfmadd.d D3, U1, U5, D3 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M2_L71 + +.L_M2_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + vstelm.d D1, C2, 0x00, 0x00 + vstelm.d D1, C3, 0x00, 0x01 + vstelm.d D2, C0, 0x08, 0x00 + vstelm.d D2, C1, 0x08, 0x01 + vstelm.d D3, C2, 0x08, 0x00 + vstelm.d D3, C3, 0x08, 0x01 +#else + /* Load C0 */ + vld U0, C0, 0x00 + /* Load C1 */ + vld U1, C1, 0x00 + /* Load C2 */ + vld U2, C2, 0x00 + /* Load C3 */ + vld U3, C3, 0x00 + + vilvl.d D4, D2, D0 //C0 + vilvh.d D5, D2, D0 //C1 + vilvl.d D6, D3, D1 //C2 + vilvh.d D7, D3, D1 //C3 + + vfmadd.d D0, D4, VALPHA, U0 + vfmadd.d D2, D5, VALPHA, U1 + vfmadd.d D1, D6, VALPHA, U2 + vfmadd.d D3, D7, VALPHA, U3 + + vst D0, C0, 0x00 + vst D2, C1, 0x00 + vst D1, C2, 0x00 + vst D3, C3, 0x00 +#endif // #if defined(TRMMKERNEL) + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + addi.d C2, C2, 0x10 + addi.d C3, C3, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -2 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ + +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + vldrepl.d U0, A0, 0x00 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + vfmul.d D0, U0, U4 + vfmul.d D1, U0, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M1_TL1_END + +.L_M1_TL1: /* TL-- */ + KERNEL8x1x4 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 +.L_M1_TL1_END: + KERNEL8x1x4_END + +.L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M1_L0 + +.L_M1_L71: + vldrepl.d U0, A0, 0x00 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M1_L71 + +.L_M1_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + vstelm.d D1, C2, 0x00, 0x00 + vstelm.d D1, C3, 0x00, 0x01 +#else + /* Load C0 */ + vldrepl.d U0, C0, 0x00 + vldrepl.d U1, C1, 0x00 + vilvl.d D4, U1, U0 + vfmadd.d D6, D0, VALPHA, D4 + + vldrepl.d U2, C2, 0x00 + vldrepl.d U3, C3, 0x00 + vilvl.d D5, U3, U2 + vfmadd.d D7, D1, VALPHA, D5 + + vstelm.d D6, C0, 0x00, 0x00 + vstelm.d D6, C1, 0x00, 0x01 + vstelm.d D7, C2, 0x00, 0x00 + vstelm.d D7, C3, 0x00, 0x01 +#endif // #if defined(TRMMKERNEL) + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + addi.d C2, C2, 0x08 + addi.d C3, C3, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -1 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 1) ) End************/ + +.L_M0: + /* Add stride for B and C + * B += (K * 32) + * C += (LDC * 32) + */ + /* since the array type is double, + * so we must mul 32 + */ + slli.d T0, K, 5 + slli.d T1, LDC, 5 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x04 +#endif + + blt ZERO, J, .L_J1 + +//////////////// go back to L_J1 ///////////////// +///////////////////////////////////////////////// +/************************ Condition 1 if((N >> 2) && (M >> 3)) END !!! ************************/ + + vldrepl.d VALPHA, $sp, 104 + +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + +/************************* Condition 2 if((N & 2) && (M >> 3)) START !!! ************************* +* dgemm_core_16x2 */ + + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_N3_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_N3_M8 + +.L_N3_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 + * U0 = {a1, a0} + * U1 = {a3, a2} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + vfmul.d D6, U2, U5 + vfmul.d D7, U3, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_N3_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_TL1_END + +.L_N3_TL1: /* TL-- */ + KERNEL8x8x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_TL1 +.L_N3_TL1_END: + KERNEL8x8x2_END + +.L_N3_L7: + /* if (!(L & 7)) goto L_N3_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_L0 + +.L_N3_L71: + /* Load 16 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_L71 + +.L_N3_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D6, D6, VALPHA + vfmul.d D7, D7, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + vld U4, C1, 0x00 + vld U5, C1, 0x10 + vld U6, C1, 0x20 + vld U7, C1, 0x30 + vfmadd.d D4, D4, VALPHA, U4 + vfmadd.d D5, D5, VALPHA, U5 + vfmadd.d D6, D6, VALPHA, U6 + vfmadd.d D7, D7, VALPHA, U7 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + vst D6, C1, 0x20 + vst D7, C1, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x8 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N3_I1 + +.L_N3_M8: + /* We have done M & 8, considering M=4/2/1 */ + andi I, M, 7 + beq ZERO,I, .L_N3_M0 + + andi I, M, 4 + beq ZERO,I, .L_N3_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M4_L7 */ + beq ZERO,TL, .L_N3_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M4_TL1_END + +.L_N3_M4_TL1: /* TL-- */ + KERNEL8x4x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M4_TL1 +.L_N3_M4_TL1_END: + KERNEL8x4x2_END + +.L_N3_M4_L7: + /* if (!(L & 7)) goto L_N3_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M4_L0 + +.L_N3_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M4_L71 + +.L_N3_M4_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + vld U2, C1, 0x00 + vld U3, C1, 0x10 + vfmadd.d D4, D4, VALPHA, U2 + vfmadd.d D5, D5, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 4) ) End************/ + +.L_N3_M2: + andi I, M, 2 + beq ZERO,I, .L_N3_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + vldrepl.d U4, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M2_L7 */ + beq ZERO,TL, .L_N3_M2_L7 + + vld U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M2_TL1_END + +.L_N3_M2_TL1: /* TL-- */ + KERNEL8x2x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M2_TL1 +.L_N3_M2_TL1_END: + KERNEL8x2x2_END + +.L_N3_M2_L7: + /* if (!(L & 7)) goto L_N3_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M2_L0 + +.L_N3_M2_L71: + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + vfmadd.d D0, U0, U4, D0 + + vfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M2_L71 + +.L_N3_M2_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + vld U1, C1, 0x00 + vfmadd.d D4, D4, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + vst D0, C0, 0x00 + vst D4, C1, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 2) ) End************/ + +.L_N3_M1: + andi I, M, 1 + beq ZERO,I, .L_N3_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + + vld U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_N3_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + + vld U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M1_TL1_END + +.L_N3_M1_TL1: /* TL-- */ + KERNEL8x1x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M1_TL1 +.L_N3_M1_TL1_END: + KERNEL8x1x2_END + +.L_N3_M1_L7: + /* if (!(L & 7)) goto L_N3_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M1_L0 + +.L_N3_M1_L71: + vldrepl.d U0, A0, 0x00 + + vld U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M1_L71 + +.L_N3_M1_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C1, 0x00 + vilvl.d U2, U1, U0 + vfmadd.d D0, D0, VALPHA, U2 +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 1) ) End************/ + +.L_N3_M0: + /* Add stride for B and C + * B += (K * 16) + * C += (LDC * 16) + */ + /* since the array type is double, + * so we must mul 16 + */ + slli.d T0, K, 4 + slli.d T1, LDC, 4 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x02 +#endif + + /* We must reinit I */ + srai.d I, M, 4 /* I = bm >> 4 */ + +/************************* Condition 2 if((N & 2) && (M >> 3)) End !!! ************************* +* dgemm_core_16x2 */ + +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + +/************************* Condition 3 if((N & 1) && (M >> 3)) START !!! ************************* +* dgemm_core_16x1 */ + + move C0, C + move A0, A + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_N1_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_N1_M8 + +.L_N1_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 + * U0 = {a3, a2} + * U1 = {a1, a0} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_L7 */ + beq ZERO,TL, .L_N1_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_TL1_END +.L_N1_TL1: /* TL-- */ + KERNEL8x8x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_TL1 +.L_N1_TL1_END: + KERNEL8x8x1_END + +.L_N1_L7: + /* if (!(L & 7)) goto L_N1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_L0 + +.L_N1_L71: + /* Load 16 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_L71 + +.L_N1_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x8 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_I1 + +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 7 + beq ZERO,I, .L_N1_M0 + + andi I, M, 4 + beq ZERO,I, .L_N1_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M4_L7 */ + beq ZERO,TL, .L_N1_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M4_TL1_END + +.L_N1_M4_TL1: /* TL-- */ + KERNEL8x4x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M4_TL1 +.L_N1_M4_TL1_END: + KERNEL8x4x1_END + +.L_N1_M4_L7: + /* if (!(L & 7)) goto L_N1_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M4_L0 + +.L_N1_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M4_L71 + +.L_N1_M4_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1) && (M & 4) ) End************/ + +.L_N1_M2: + andi I, M, 2 + beq ZERO,I, .L_N1_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 + + vld U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M2_TL1_END + +.L_N1_M2_TL1: /* TL-- */ + KERNEL8x2x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 +.L_N1_M2_TL1_END: + KERNEL8x2x1_END + +.L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M2_L0 + +.L_N1_M2_L71: + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M2_L71 + +.L_N1_M2_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C0, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 2) ) End************/ + +.L_N1_M1: + andi I, M, 1 + beq ZERO,I, .L_N1_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M1_TL1_END + +.L_N1_M1_TL1: /* TL-- */ + KERNEL8x1x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 +.L_N1_M1_TL1_END: + KERNEL8x1x1_END + +.L_N1_M1_L7: + /* if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 + +.L_N1_M1_L71: + vldrepl.d U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 + +.L_N1_M1_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vldrepl.d U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 1) ) End************/ + +.L_N1_M0: + +/************************* Condition 3 if((N & 1) && (M >> 3)) End !!! ************************* +* dgemm_core_16x1 */ + +.L_N0: + /* Restore regs */ + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LD $f24, $sp, 40 + LD $f25, $sp, 48 + LD $f26, $sp, 56 + LD $f27, $sp, 64 + LD $f28, $sp, 72 + LD $f29, $sp, 80 + LD $f30, $sp, 88 + LD $f31, $sp, 96 + addi.d $sp, $sp, 112 + + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_4_lsx.S b/kernel/loongarch64/dgemm_ncopy_4_lsx.S new file mode 100644 index 000000000..048a49af6 --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_4_lsx.S @@ -0,0 +1,185 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r21 +#define TL $r7 +#define T0 $r6 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + PROLOGUE + + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x02 + beq J, ZERO, .L_N2 +.L_J1: /* J-- */ + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + addi.d J, J, -1 + beq I, ZERO, .L_I3 +.L_I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + addi.d TD, TD, 0x40 + + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_I0 +.L_II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_II1 +.L_I0: + blt ZERO, J, .L_J1 +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x01 + add.d TS, S2, TL + beq I, ZERO, .L_2I3 +.L_2I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00 + GINTERLACE v, d, D0, D1, U1, U0 + GST v, , D0, TD, 0x00, D1, TD, 0x10 + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_2I1 +.L_2I3: + andi I, M, 0x01 + beq ZERO, I, .L_N1 +.L_2II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fst.d F0, TD, 0x00 + addi.d I, I, -1 + fst.d F1, TD, 0x08 + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + blt ZERO, I, .L_2II1 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_M1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d TD, TD, 0x08 + addi.d M, M, -1 + blt ZERO, M, .L_M1 +.L_N0: + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_8_lsx.S b/kernel/loongarch64/dgemm_ncopy_8_lsx.S new file mode 100644 index 000000000..30bebe8df --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_8_lsx.S @@ -0,0 +1,283 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r21 +#define TL $r7 +#define T0 $r6 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + PROLOGUE + push_if_used 26, 32 + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x03 + beq J, ZERO, .L_N4 +.L_J1: + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x03 + add.d S3, S2, TL + addi.d J, J, -1 + add.d S4, S3, TL + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d TS, S7, T0 + beq I, ZERO, .L_I7 +.L_I1: + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00, \ + U4, S5, 0x00, U5, S6, 0x00, U6, S7, 0x00, U7, S8, 0x00 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10, \ + U4, S5, 0x10, U5, S6, 0x10, U6, S7, 0x10, U7, S8, 0x10 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x20, U1, S2, 0x20, U2, S3, 0x20, U3, S4, 0x20, \ + U4, S5, 0x20, U5, S6, 0x20, U6, S7, 0x20, U7, S8, 0x20 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x30, U1, S2, 0x30, U2, S3, 0x30, U3, S4, 0x30, \ + U4, S5, 0x30, U5, S6, 0x30, U6, S7, 0x30, U7, S8, 0x30 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_I7: + andi I, M, 0x07 + beq I, ZERO, .L_I0 +.L_II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + fst.d F4, TD, 0x20 + addi.d S5, S5, 0x08 + fst.d F5, TD, 0x28 + addi.d S6, S6, 0x08 + fst.d F6, TD, 0x30 + addi.d S7, S7, 0x08 + fst.d F7, TD, 0x38 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_II1 +.L_I0: + blt ZERO, J, .L_J1 +.L_N4: + andi J, N, 0x04 + beq ZERO, J, .L_N2 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + beq I, ZERO, .L_I3 +.L_4I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + addi.d TD, TD, 0x40 + + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_4I1 +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_N2 +.L_4II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_4II1 +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x01 + add.d TS, S2, TL + beq I, ZERO, .L_NI1 +.L_2I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00 + GINTERLACE v, d, D0, D1, U1, U0 + GST v, , D0, TD, 0x00, D1, TD, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_2I1 +.L_NI1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_M1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d TD, TD, 0x08 + addi.d M, M, -1 + blt ZERO, M, .L_M1 +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_4_lsx.S b/kernel/loongarch64/dgemm_tcopy_4_lsx.S new file mode 100644 index 000000000..134066471 --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_4_lsx.S @@ -0,0 +1,280 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define P0 $r16 +#define P1 $r17 +#define P2 $r18 +#define P3 $r19 +#define T0 $r20 +#define T1 $r23 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 + + PROLOGUE + push_if_used 18, 8 + + move S0, SRC + move P0, DST + + // Find P0, P2, P3 + srai.d T0, N, 0x02 + slli.d T0, T0, 0x02 + srai.d T1, N, 0x01 + slli.d T1, T1, 0x01 + mul.d T0, M, T0 + mul.d T1, M, T1 + slli.d T0, T0, 0x03 + slli.d T1, T1, 0x03 + add.d P2, DST, T0 + add.d P3, DST, T1 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x02 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x05 + beq ZERO, J, .L_M3 +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x02 + addi.d J, J, -1 + beq ZERO, I, .L_N3 +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + add.d P1, P1, T1 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P2, P2, 0x40 +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + fst.d F2, P3, 0x10 + fst.d F3, P3, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P3, P3, 0x20 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_2N3 + +.L_2I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_2I1 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P2, P2, 0x20 + +.L_2N1: + addi.d I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P3, P3, 0x10 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + move P1, P0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_1N3 + +.L_1I1: + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + + addi.d S1, S1, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_1I1 + +.L_1N3: + andi I, N, 0x02 + beq I, ZERO, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P2, 0x00 + fst.d F1, P2, 0x08 + + addi.d S1, S1, 0x10 + addi.d P2, P2, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq I, ZERO, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P3, 0x00 + +.L_M0: + pop_if_used 18, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_8_lsx.S b/kernel/loongarch64/dgemm_tcopy_8_lsx.S new file mode 100644 index 000000000..a7e3ef69c --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_8_lsx.S @@ -0,0 +1,597 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 + + PROLOGUE + push_if_used 24, 8 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x03 + srai.d T1, N, 0x02 + slli.d T0, T0, 0x03 + slli.d T1, T1, 0x02 + mul.d P2, M, T0 + mul.d P3, M, T1 + slli.d P2, P2, 0x03 + slli.d P3, P3, 0x03 + add.d P2, DST, P2 + add.d P3, DST, P3 + + srai.d T0, N, 0x01 + slli.d T0, T0, 0x01 + mul.d P4, M, T0 + slli.d P4, P4, 0x03 + add.d P4, DST, P4 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x03 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x06 + beq ZERO, J, .L_M7 +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S0, S7, T0 + + move P1, P0 + addi.d P0, P0, 0x200 + + srai.d I, N, 0x03 + addi.d J, J, -1 + beq ZERO, I, .L_N7 + +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + vld U0, S3, 0x00 + vld U1, S3, 0x10 + vld U2, S3, 0x20 + vld U3, S3, 0x30 + vld U4, S4, 0x00 + vld U5, S4, 0x10 + vld U6, S4, 0x20 + vld U7, S4, 0x30 + + vst U0, P1, 0x80 + vst U1, P1, 0x90 + vst U2, P1, 0xa0 + vst U3, P1, 0xb0 + vst U4, P1, 0xc0 + vst U5, P1, 0xd0 + vst U6, P1, 0xe0 + vst U7, P1, 0xf0 + + vld U0, S5, 0x00 + vld U1, S5, 0x10 + vld U2, S5, 0x20 + vld U3, S5, 0x30 + vld U4, S6, 0x00 + vld U5, S6, 0x10 + vld U6, S6, 0x20 + vld U7, S6, 0x30 + + vst U0, P1, 0x100 + vst U1, P1, 0x110 + vst U2, P1, 0x120 + vst U3, P1, 0x130 + vst U4, P1, 0x140 + vst U5, P1, 0x150 + vst U6, P1, 0x160 + vst U7, P1, 0x170 + + vld U0, S7, 0x00 + vld U1, S7, 0x10 + vld U2, S7, 0x20 + vld U3, S7, 0x30 + vld U4, S8, 0x00 + vld U5, S8, 0x10 + vld U6, S8, 0x20 + vld U7, S8, 0x30 + + vst U0, P1, 0x180 + vst U1, P1, 0x190 + vst U2, P1, 0x1a0 + vst U3, P1, 0x1b0 + vst U4, P1, 0x1c0 + vst U5, P1, 0x1d0 + vst U6, P1, 0x1e0 + vst U7, P1, 0x1f0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_I1 +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + vst U4, P2, 0x40 + vst U5, P2, 0x50 + vst U6, P2, 0x60 + vst U7, P2, 0x70 + + vld U0, S5, 0x00 + vld U1, S5, 0x10 + vld U2, S6, 0x00 + vld U3, S6, 0x10 + vld U4, S7, 0x00 + vld U5, S7, 0x10 + vld U6, S8, 0x00 + vld U7, S8, 0x10 + + vst U0, P2, 0x80 + vst U1, P2, 0x90 + vst U2, P2, 0xa0 + vst U3, P2, 0xb0 + vst U4, P2, 0xc0 + vst U5, P2, 0xd0 + vst U6, P2, 0xe0 + vst U7, P2, 0xf0 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S5, S5, 0x20 + addi.d S6, S6, 0x20 + addi.d S7, S7, 0x20 + addi.d S8, S8, 0x20 + addi.d P2, P2, 0x100 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + vld U4, S5, 0x00 + vld U5, S6, 0x00 + vld U6, S7, 0x00 + vld U7, S8, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + vst U2, P3, 0x20 + vst U3, P3, 0x30 + vst U4, P3, 0x40 + vst U5, P3, 0x50 + vst U6, P3, 0x60 + vst U7, P3, 0x70 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d P3, P3, 0x80 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + fst.d F2, P4, 0x10 + fst.d F3, P4, 0x18 + fst.d F4, P4, 0x20 + fst.d F5, P4, 0x28 + + fst.d F6, P4, 0x30 + fst.d F7, P4, 0x38 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d P4, P4, 0x40 + +.L_N0: + blt ZERO, J, .L_J1 +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 + + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x100 + + srai.d I, N, 0x03 + beq ZERO, I, .L_4N7 +.L_4I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + vld U0, S3, 0x00 + vld U1, S3, 0x10 + vld U2, S3, 0x20 + vld U3, S3, 0x30 + vld U4, S4, 0x00 + vld U5, S4, 0x10 + vld U6, S4, 0x20 + vld U7, S4, 0x30 + + vst U0, P1, 0x80 + vst U1, P1, 0x90 + vst U2, P1, 0xa0 + vst U3, P1, 0xb0 + vst U4, P1, 0xc0 + vst U5, P1, 0xd0 + vst U6, P1, 0xe0 + vst U7, P1, 0xf0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_4I1 +.L_4N7: + andi I, N, 0x04 + beq ZERO, I, .L_4N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + vst U4, P2, 0x40 + vst U5, P2, 0x50 + vst U6, P2, 0x60 + vst U7, P2, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d P2, P2, 0x80 + +.L_4N3: + andi I, N, 0x02 + beq ZERO, I, .L_4N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + vst U2, P3, 0x20 + vst U3, P3, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P3, P3, 0x40 + +.L_4N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + fst.d F2, P4, 0x10 + fst.d F3, P4, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P4, P4, 0x20 +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x03 + beq ZERO, I, .L_2N7 +.L_2I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_2I1 +.L_2N7: + andi I, N, 0x04 + beq ZERO, I, .L_2N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d P2, P2, 0x40 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P3, P3, 0x20 + +.L_2N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P4, P4, 0x10 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + add.d S2, S0, TL + + move P1, P0 + addi.d P0, P0, 0x40 + + srai.d I, N, 0x03 + beq ZERO, I, .L_1N7 +.L_1I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + + addi.d S1, S1, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_1I1 + +.L_1N7: + andi I, N, 0x04 + beq ZERO, I, .L_1N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + + addi.d S1, S1, 0x20 + addi.d P2, P2, 0x20 + +.L_1N3: + andi I, N, 0x02 + beq ZERO, I, .L_1N1 + + vld U0, S1, 0x00 + vst U0, P3, 0x00 + + addi.d S1, S1, 0x10 + addi.d P3, P3, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P4, 0x00 + + addi.d S1, S1, 0x08 + addi.d P4, P4, 0x08 +.L_M0: + pop_if_used 24, 8 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/dgemv_n_8_lasx.S b/kernel/loongarch64/dgemv_n_8_lasx.S new file mode 100644 index 000000000..a49bf9bb1 --- /dev/null +++ b/kernel/loongarch64/dgemv_n_8_lasx.S @@ -0,0 +1,554 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/07/14 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA $f0 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define Y_ORG $r15 +#define OFFSET $r16 +#define K_LDA $r17 +#define M8 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 + +#define VALPHA $xr1 +#define X0 $xr2 +#define X1 $xr3 +#define X2 $xr4 +#define X3 $xr5 +#define X4 $xr6 +#define X5 $xr7 +#define X6 $xr8 +#define X7 $xr9 +#define Y0 $xr10 +#define Y1 $xr11 +#define A0 $xr12 +#define A1 $xr13 +#define A2 $xr14 +#define A3 $xr15 +#define A4 $xr16 +#define A5 $xr17 +#define A6 $xr18 +#define A7 $xr19 +#define A8 $xr20 +#define A9 $xr21 +#define A10 $xr22 +#define A11 $xr23 +#define A12 $xr24 +#define A13 $xr25 +#define A14 $xr26 +#define A15 $xr27 + +.macro DLOAD_X_8 + GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \ + X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ + X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA +.endm + +.macro DLOAD_X_4 + GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA +.endm + +.macro DLOAD_X_2 + GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA +.endm + +.macro DLOAD_X_1 + GLDREPL xv, d, X0, X, 0x00 + GMUL xvf, d, X0, X0, VALPHA +.endm + +.macro DLOAD_Y_8 + GLD xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro DLOAD_Y_4 + GLD xv, , Y0, Y, 0 +.endm + +.macro DLOAD_Y_1 + fld.d $f10, Y, 0 +.endm + +.macro DSTORE_Y_8 + GST xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro DSTORE_Y_4 + GST xv, , Y0, Y, 0 +.endm + +.macro DSTORE_Y_1 + fst.d $f10, Y, 0 +.endm + +// Unable to use vector load/store ins +.macro DLOAD_Y_8_GAP + fld.d $f10, Y, 0 + fldx.d $f13, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + fld.d $f14, T0, 0 + fldx.d $f15, T0, INC_Y + PTR_ALSL T0, INC_Y, Y, 2 + fld.d $f11, T0, 0 + fldx.d $f17, T0, INC_Y + PTR_ADD T0, T0, INC_Y + PTR_ADD T0, T0, INC_Y + fld.d $f18, T0, 0 + fldx.d $f19, T0, INC_Y + GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3 +.endm + +.macro DLOAD_Y_4_GAP + fld.d $f10, Y, 0 + fldx.d $f13, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + fld.d $f14, T0, 0 + fldx.d $f15, T0, INC_Y + GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3 +.endm + +.macro DSTORE_Y_8_GAP + xvstelm.d Y0, Y, 0, 0 + PTR_ADD T0, Y, INC_Y + xvstelm.d Y0, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 3 + + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 0 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 3 +.endm + +.macro DSTORE_Y_4_GAP + xvstelm.d Y0, Y, 0, 0 + PTR_ADD T0, Y, INC_Y + xvstelm.d Y0, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 3 +.endm + +.macro DLOAD_X_8_GAP + xvldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.d X1, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X2, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X3, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X4, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X5, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X6, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X7, T0, 0x00 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ + X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA +.endm + +.macro DLOAD_X_4_GAP + xvldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.d X1, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X2, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X3, T0, 0x00 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA +.endm + +.macro DLOAD_X_2_GAP + xvldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.d X1, T0, 0x00 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA +.endm + +.macro DGEMV_N_8x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0, \ + A8, PA4, 0, A9, PA4, 0, \ + A10, PA5, 0, A11, PA5, 0, \ + A12, PA6, 0, A13, PA6, 0, \ + A14, PA7, 0, A15, PA7, 0 + + GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ + Y0, A2, X1, Y0, Y1, A3, X1, Y1, \ + Y0, A4, X2, Y0, Y1, A5, X2, Y1, \ + Y0, A6, X3, Y0, Y1, A7, X3, Y1, \ + Y0, A8, X4, Y0, Y1, A9, X4, Y1, \ + Y0, A10, X5, Y0, Y1, A11, X5, Y1, \ + Y0, A12, X6, Y0, Y1, A13, X6, Y1, \ + Y0, A14, X7, Y0, Y1, A15, X7, Y1 +.endm + +.macro DGEMV_N_4x8 + GLD_INC xv, , 0x20, A0, PA0, 0, \ + A2, PA1, 0, \ + A4, PA2, 0, \ + A6, PA3, 0, \ + A8, PA4, 0, \ + A10, PA5, 0, \ + A12, PA6, 0, \ + A14, PA7, 0 + + GMADD xvf, d, Y0, A0, X0, Y0, \ + Y0, A2, X1, Y0, \ + Y0, A4, X2, Y0, \ + Y0, A6, X3, Y0, \ + Y0, A8, X4, Y0, \ + Y0, A10, X5, Y0, \ + Y0, A12, X6, Y0, \ + Y0, A14, X7, Y0 +.endm + +.macro DGEMV_N_1x8 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \ + $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0 + GMADD f, d, $f10, $f12, $f2, $f10, \ + $f10, $f14, $f3, $f10, \ + $f10, $f16, $f4, $f10, \ + $f10, $f18, $f5, $f10, \ + $f10, $f20, $f6, $f10, \ + $f10, $f22, $f7, $f10, \ + $f10, $f24, $f8, $f10, \ + $f10, $f26, $f9, $f10, +.endm + +.macro DGEMV_N_8x4 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + + GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ + Y0, A2, X1, Y0, Y1, A3, X1, Y1, \ + Y0, A4, X2, Y0, Y1, A5, X2, Y1, \ + Y0, A6, X3, Y0, Y1, A7, X3, Y1 +.endm + +.macro DGEMV_N_4x4 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0 + + GMADD xvf, d, Y0, A0, X0, Y0, Y0, A2, X1, Y0, \ + Y0, A4, X2, Y0, Y0, A6, X3, Y0 +.endm + +.macro DGEMV_N_1x4 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0 + GMADD f, d, $f10, $f12, $f2, $f10, $f10, $f14, $f3, $f10, \ + $f10, $f16, $f4, $f10, $f10, $f18, $f5, $f10 +.endm + +.macro DGEMV_N_8x2 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0 + GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ + Y0, A2, X1, Y0, Y1, A3, X1, Y1 +.endm + +.macro DGEMV_N_4x2 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0 + GMADD xvf, d, Y0, A0, X0, Y0, \ + Y0, A2, X1, Y0 +.endm + +.macro DGEMV_N_1x2 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0 + GMADD f, d, $f10, $f12, $f2, $f10, \ + $f10, $f14, $f3, $f10 +.endm + +.macro DGEMV_N_1x1 + fld.d $f12, PA0, 0 + PTR_ADDI PA0, PA0, 0x08 + fmadd.d $f10, $f12, $f2, $f10 +.endm + +.macro DGEMV_N_LASX XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req + PTR_SRLI J, N, 3 + beqz J, .L_\XW\()_N_7 + PTR_SLLI K_LDA, LDA, 3 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L8: + DLOAD_\X_8 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_M_7 +.align 5 +.L_\XW\()_M_L8: + DLOAD_\Y_8 + DGEMV_N_8x8 + DSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ALSL Y, INC_Y, Y, 3 + PTR_ADDI K, K, 8 + bnez I, .L_\XW\()_M_L8 +.L_\XW\()_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_M_3 + DLOAD_\Y_4 + DGEMV_N_4x8 + DSTORE_\Y_4 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 +.L_\XW\()_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + DLOAD_\Y_1 + DGEMV_N_1x8 + DSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#endif + PTR_ALSL X, INC_X, X, 3 + bnez J, .L_\XW\()_N_L8 +.L_\XW\()_N_7: + andi J, N, 4 + beqz J, .L_\XW\()_N_3 + DLOAD_\X_4 + xor K, K, K + move Y, Y_ORG + + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_4_M_7 +.align 5 +.L_\XW\()_N_4_M_L8: + DLOAD_\Y_8 + DGEMV_N_8x4 + DSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ADDI K, K, 8 + PTR_ALSL Y, INC_Y, Y, 3 + bnez I, .L_\XW\()_N_4_M_L8 +.L_\XW\()_N_4_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_N_4_M_3 + DLOAD_\Y_4 + DGEMV_N_4x4 + DSTORE_\Y_4 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 +.L_\XW\()_N_4_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_N_4_M_END +.align 5 +.L_\XW\()_N_4_M_L1: + DLOAD_\Y_1 + DGEMV_N_1x4 + DSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_4_M_L1 +.L_\XW\()_N_4_M_END: + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M8 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_N_3: + andi J, N, 2 + beqz J, .L_\XW\()_N_1 + DLOAD_\X_2 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_2_M_7 +.align 5 +.L_\XW\()_N_2_M_L8: + DLOAD_\Y_8 + DGEMV_N_8x2 + DSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ADDI K, K, 8 + PTR_ALSL Y, INC_Y, Y, 3 + bnez I, .L_\XW\()_N_2_M_L8 +.L_\XW\()_N_2_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_N_2_M_3 + DLOAD_\Y_4 + DGEMV_N_4x2 + DSTORE_\Y_4 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 +.L_\XW\()_N_2_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_N_2_M_END +.align 5 +.L_\XW\()_N_2_M_L1: + DLOAD_\Y_1 + DGEMV_N_1x2 + DSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_2_M_L1 +.L_\XW\()_N_2_M_END: + PTR_SLLI K_LDA, LDA, 1 + PTR_SUB K_LDA, K_LDA, M8 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD PA1, PA1, K_LDA + PTR_ALSL X, INC_X, X, 1 +.L_\XW\()_N_1: + andi J, N, 1 + beqz J, .L_END + DLOAD_\X_1 + xor K, K, K + move Y, Y_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + DLOAD_\Y_1 + DGEMV_N_1x1 + DSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_1_M_L1 + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 7, 24 + 4 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + PTR_SUB J, INC_Y, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ + PTR_ALSL I, I, J, 1 + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + xvreplve0.d VALPHA, $xr0 + move Y_ORG, Y + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0_0 - .L_GAP_TABLE + .hword .L_GAP_0_1 - .L_GAP_TABLE + .hword .L_GAP_1_0 - .L_GAP_TABLE + .hword .L_GAP_1_1 - .L_GAP_TABLE +.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ + DGEMV_N_LASX GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1 +.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ + DGEMV_N_LASX GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1 +.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ + DGEMV_N_LASX GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1 +.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ + DGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 +.L_END: + pop_if_used 17 + 7, 24 + 4 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dgemv_n_lsx.S b/kernel/loongarch64/dgemv_n_lsx.S new file mode 100644 index 000000000..9a0141fb1 --- /dev/null +++ b/kernel/loongarch64/dgemv_n_lsx.S @@ -0,0 +1,229 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Param */ +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r16 +#define ALPHA $f0 + +#define YORIG $r18 +#define T0 $r19 +#define T1 $r20 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define IX $r25 +#define IY $r26 +#define II $r27 +#define T2 $r28 +#define T3 $r29 +#define T4 $r30 + +/* LSX vectors */ +#define U0 $vr11 +#define U1 $vr12 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define VALPHA $vr10 + +#define a1 $f3 +#define a2 $f4 +#define a3 $f5 +#define a4 $f6 +#define a5 $f7 +#define a6 $f8 +#define a7 $f9 +#define a8 $f10 + + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 + + addi.d $sp, $sp, -80 + + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 32 + SDARG $r27, $sp, 40 + SDARG $r28, $sp, 48 + SDARG $r29, $sp, 56 + SDARG $r30, $sp, 64 + ST ALPHA, $sp, 72 + + vldrepl.d VALPHA, $sp, 72 + + slli.d LDA, LDA, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + + bge $r0, M, .L999 + bge $r0, N, .L999 + + move J, $r0 + move IX, $r0 + + move AO1, A //a_ptr + move XX, X + move YY, Y + + beq J, M, .L999 + +.L01: + vldx U0, XX, IX + vshuf4i.d U0, U0, 0x00 + + vfmul.d U1, VALPHA, U0 //temp1 + + move IY, $r0 + move II, $r0 + move I, $r0 + + srai.d T0, M, 2 //n/4 + beq I, T0, .L03 + +.L02: + vldx U2, AO1, II + addi.d II, II, 16 + vldx U7, AO1, II + + move T1, IY + add.d T2, T1, INCY + add.d T3, T2, INCY + add.d T4, T3, INCY + + fldx.d a1, YY, T1 + fldx.d a2, YY, T2 + fldx.d a3, YY, T3 + fldx.d a4, YY, T4 + + vextrins.d U3, U4, 0x10 + vextrins.d U5, U6, 0x10 + + vfmadd.d U3, U1, U2, U3 + vfmadd.d U5, U1, U7, U5 + + vextrins.d U4, U3, 0x01 + vextrins.d U6, U5, 0x01 + + fstx.d a1, YY, T1 + fstx.d a2, YY, T2 + fstx.d a3, YY, T3 + fstx.d a4, YY, T4 + + add.d IY, T4, INCY + addi.d II, II, 16 + addi.d I, I, 1 + blt I, T0, .L02 + +.L03: + andi T0, M, 2 + beq $r0, T0, .L04 + + addi.d T1, $r0, 4 + mod.d T1, M, T1 + sub.d II, M, T1 + slli.d II, II, BASE_SHIFT + + move T1, IY + add.d T2, T1, INCY + + vldx U2, AO1, II + + fldx.d a1, YY, T1 + fldx.d a2, YY, T2 + + vextrins.d U3, U4, 0x10 + + vfmadd.d U3, U1, U2, U3 + + vextrins.d U4, U3, 0x01 + + fstx.d a1, YY, T1 + fstx.d a2, YY, T2 + + add.d IY, T2, INCY + +.L04: + andi T0, M, 1 + beq $r0, T0, .L05 + + addi.d II, M, -1 + slli.d II, II, BASE_SHIFT + + fldx.d a1, AO1, II + fldx.d a3, YY, IY + + fmadd.d a3, $f12, a1, a3 + + fstx.d a3, YY, IY + + add.d IY, IY, INCY + +.L05: + add.d AO1, AO1, LDA + add.d IX, IX, INCX + + addi.d J, J, 1 + blt J, N, .L01 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 32 + LDARG $r27, $sp, 40 + LDARG $r28, $sp, 48 + LDARG $r29, $sp, 56 + LDARG $r30, $sp, 64 + LD ALPHA, $sp, 72 + addi.d $sp, $sp, 80 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dgemv_t_8_lasx.S b/kernel/loongarch64/dgemv_t_8_lasx.S new file mode 100644 index 000000000..71f942b0f --- /dev/null +++ b/kernel/loongarch64/dgemv_t_8_lasx.S @@ -0,0 +1,481 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/07/17 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA $f0 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define PY0 $r14 +#define X_ORG $r15 +#define PY1 $r16 +#define K_LDA $r17 +#define PY2 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 +#define M8 $r30 + +#define VALPHA $xr0 +#define X0 $xr1 +#define X1 $xr2 +#define A0 $xr3 +#define A1 $xr4 +#define A2 $xr5 +#define A3 $xr6 +#define A4 $xr7 +#define A5 $xr8 +#define A6 $xr9 +#define A7 $xr10 +#define A8 $xr11 +#define A9 $xr12 +#define A10 $xr13 +#define A11 $xr14 +#define A12 $xr15 +#define A13 $xr16 +#define A14 $xr17 +#define A15 $xr18 +#define TP0 $xr19 +#define TP1 $xr20 +#define TP2 $xr21 +#define TP3 $xr22 +#define TP4 $xr23 +#define TP5 $xr24 +#define TP6 $xr25 +#define TP7 $xr26 +#define Y0 $xr3 +#define Y1 $xr4 +#define Y2 $xr5 +#define Y3 $xr6 +#define Y4 $xr7 +#define Y5 $xr8 +#define Y6 $xr9 +#define Y7 $xr10 + +.macro ZERO_Y8 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \ + TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7 +.endm + +.macro ZERO_Y4 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 +.endm + +.macro ZERO_Y2 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1 +.endm + +.macro ZERO_Y1 + GXOR xv, v, TP0, TP0, TP0 +.endm + +.macro DLOAD_X8 + GLD xv, , X0, X, 0x00, X1, X, 0x20 +.endm + +.macro DLOAD_X4 + GLD xv, , X0, X, 0x00 +.endm + +.macro DLOAD_X8_GAP + fld.d $f1, X, 0x00 + fldx.d $f2, X, INC_X + PTR_ALSL T0, INC_X, X, 1 + fld.d $f3, T0, 0x00 + fldx.d $f4, T0, INC_X + GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 + PTR_ALSL T0, INC_X, X, 2 + fld.d $f2, T0, 0x00 + fldx.d $f3, T0, INC_X + PTR_ALSL T0, INC_X, T0, 1 + fld.d $f4, T0, 0x00 + fldx.d $f5, T0, INC_X + GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3 +.endm + +.macro DLOAD_X4_GAP + fld.d $f1, X, 0x00 + fldx.d $f2, X, INC_X + PTR_ALSL T0, INC_X, X, 1 + fld.d $f3, T0, 0x00 + fldx.d $f4, T0, INC_X + GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 +.endm + +.macro DGEMV_T_8x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0, \ + A8, PA4, 0, A9, PA4, 0, \ + A10, PA5, 0, A11, PA5, 0, \ + A12, PA6, 0, A13, PA6, 0, \ + A14, PA7, 0, A15, PA7, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ + TP1, A2, X0, TP1, TP1, A3, X1, TP1, \ + TP2, A4, X0, TP2, TP2, A5, X1, TP2, \ + TP3, A6, X0, TP3, TP3, A7, X1, TP3, \ + TP4, A8, X0, TP4, TP4, A9, X1, TP4, \ + TP5, A10, X0, TP5, TP5, A11, X1, TP5, \ + TP6, A12, X0, TP6, TP6, A13, X1, TP6, \ + TP7, A14, X0, TP7, TP7, A15, X1, TP7 +.endm + +.macro DGEMV_T_8x4 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0, \ + A8, PA4, 0, A10, PA5, 0, A12, PA6, 0, A14, PA7, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \ + TP2, A4, X0, TP2, TP3, A6, X0, TP3, \ + TP4, A8, X0, TP4, TP5, A10, X0, TP5, \ + TP6, A12, X0, TP6, TP7, A14, X0, TP7, +.endm + +.macro DGEMV_T_4x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ + TP1, A2, X0, TP1, TP1, A3, X1, TP1, \ + TP2, A4, X0, TP2, TP2, A5, X1, TP2, \ + TP3, A6, X0, TP3, TP3, A7, X1, TP3 +.endm + +.macro DGEMV_T_4x4 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \ + TP2, A4, X0, TP2, TP3, A6, X0, TP3 +.endm + +.macro DGEMV_T_2x8 + GLD_INC xv, , 0x20, A0, PA0, 0, A1, PA0, 0, A2, PA1, 0, A3, PA1, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ + TP1, A2, X0, TP1, TP1, A3, X1, TP1 +.endm + +.macro DGEMV_T_2x4 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1 +.endm + +.macro DGEMV_T_LASX XW:req X8:req, X4:req + PTR_SRLI J, N, 3 + beqz J, .L_\XW\()_N_7 + PTR_SLLI K_LDA, LDA, 3 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L8: + ZERO_Y8 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_M_7 +.align 5 +.L_\XW\()_M_L8: + DLOAD_\X8 + DGEMV_T_8x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_M_L8 +.L_\XW\()_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_M_3 + DLOAD_\X4 + DGEMV_T_8x4 + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_M_3: + // Accumulated + GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \ + Y5, TP5, Y6, TP6, Y7, TP7 + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + fld.d $f1, X, 0x00 + fld.d $f11, PA0, 0x00 + fld.d $f12, PA1, 0x00 + fld.d $f13, PA2, 0x00 + fld.d $f14, PA3, 0x00 + fld.d $f15, PA4, 0x00 + fld.d $f16, PA5, 0x00 + fld.d $f17, PA6, 0x00 + fld.d $f18, PA7, 0x00 +#if __loongarch_grlen == 64 + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#elif __loongarch_grlen == 32 + GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#else + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#endif + GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6, \ + $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, $f10, $f18, $f1, $f10 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + fld.d $f13, PY0, 0x00 + fldx.d $f14, PY0, INC_Y + PTR_ALSL PY1, INC_Y, Y, 2 + fld.d $f15, PY1, 0x00 + fldx.d $f16, PY1, INC_Y + PTR_ALSL PY2, INC_Y, PY1, 1 + fld.d $f17, PY2, 0x00 + fldx.d $f18, PY2, INC_Y + + GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14, \ + $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17, $f18, ALPHA, $f10, $f18 + + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#endif + fst.d $f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + fst.d $f13, PY0, 0x00 + fstx.d $f14, PY0, INC_Y + fst.d $f15, PY1, 0x00 + fstx.d $f16, PY1, INC_Y + fst.d $f17, PY2, 0x00 + fstx.d $f18, PY2, INC_Y + PTR_ALSL Y, INC_Y, Y, 3 + bnez J, .L_\XW\()_N_L8 +.L_\XW\()_N_7: + andi J, N, 4 + beqz J, .L_\XW\()_N_3 + ZERO_Y4 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_4_M_7 +.align 5 +.L_\XW\()_N_4_M_L8: + DLOAD_\X8 + DGEMV_T_4x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_N_4_M_L8 +.L_\XW\()_N_4_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_N_4_M_3 + DLOAD_\X4 + DGEMV_T_4x4 + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_N_4_M_3: + // Accumulated + GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 + andi I, M, 3 + beqz I, .L_\XW\()_N_4_M_END +.align 5 +.L_\XW\()_N_4_M_L1: + fld.d $f1, X, 0x00 + GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00, $f13, PA2, 0x00, $f14, PA3, 0x00 + GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_N_4_M_L1 +.L_\XW\()_N_4_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + fld.d $f13, PY0, 0x00 + fldx.d $f14, PY0, INC_Y + + GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14 + + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M8 + +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + fst.d $f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + fst.d $f13, PY0, 0x00 + fstx.d $f14, PY0, INC_Y + PTR_ALSL Y, INC_Y, Y, 2 +.L_\XW\()_N_3: + andi J, N, 2 + beqz J, .L_\XW\()_N_1 + ZERO_Y2 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_2_M_7 +.align 5 +.L_\XW\()_N_2_M_L8: + DLOAD_\X8 + DGEMV_T_2x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_N_2_M_L8 +.L_\XW\()_N_2_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_N_2_M_3 + DLOAD_\X4 + DGEMV_T_2x4 + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_N_2_M_3: + // Accumulated + GACC xvf, d, Y0, TP0, Y1, TP1 + andi I, M, 3 + beqz I, .L_\XW\()_N_2_M_END +.align 5 +.L_\XW\()_N_2_M_L1: + fld.d $f1, X, 0x00 + GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00 + GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_N_2_M_L1 +.L_\XW\()_N_2_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + + GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12 + + PTR_SLLI K_LDA, LDA, 1 + PTR_SUB K_LDA, K_LDA, M8 + +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#endif + fst.d $f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + PTR_ALSL Y, INC_Y, Y, 1 +.L_\XW\()_N_1: + andi J, N, 1 + beqz J, .L_END + ZERO_Y1 + move X, X_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + fld.d $f3, PA0, 0x00 + fld.d $f1, X, 0x00 + fmadd.d $f19, $f3, $f1, $f19 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + PTR_ADDI PA0, PA0, 0x08 + bnez I, .L_\XW\()_N_1_M_L1 + fld.d $f3, Y, 0x00 + fmadd.d $f3, ALPHA, $f19, $f3 + fst.d $f3, Y, 0x00 + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 8, 24 + 3 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + xvreplve0.d VALPHA, $xr0 + move X_ORG, X + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0 - .L_GAP_TABLE + .hword .L_GAP_1 - .L_GAP_TABLE +.L_GAP_0: /* if (incx == 1) */ + DGEMV_T_LASX GAP_0, X8, X4 +.L_GAP_1: /* if (incx != 1) */ + DGEMV_T_LASX GAP_1, X8_GAP, X4_GAP +.L_END: + pop_if_used 17 + 8, 24 + 3 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dgemv_t_lsx.S b/kernel/loongarch64/dgemv_t_lsx.S new file mode 100644 index 000000000..76f0d9bdc --- /dev/null +++ b/kernel/loongarch64/dgemv_t_lsx.S @@ -0,0 +1,279 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Param */ +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r16 +#define ALPHA $f0 + +#define YORIG $r18 +#define T0 $r19 +#define T1 $r20 +#define AO3 $r12 +#define AO4 $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define IX $r25 +#define IY $r26 +#define II $r27 +#define T2 $r28 +#define T3 $r29 +#define T4 $r30 + +/* LSX vectors */ +#define U0 $vr11 +#define U1 $vr12 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define VALPHA $vr10 + +#define a1 $f3 +#define a2 $f4 +#define a3 $f5 +#define a4 $f6 +#define a5 $f7 +#define a6 $f8 +#define a7 $f9 +#define a8 $f10 + + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 + + addi.d $sp, $sp, -80 + + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 32 + SDARG $r27, $sp, 40 + SDARG $r28, $sp, 48 + SDARG $r29, $sp, 56 + SDARG $r30, $sp, 64 + ST ALPHA, $sp, 72 + + vldrepl.d VALPHA, $sp, 72 + + slli.d LDA, LDA, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + + bge $r0, M, .L999 + bge $r0, N, .L999 + + move J, $r0 + move IY, $r0 + + move AO1, A //a_ptr1 + + srai.d T0, N, 2 //n/4 + beq J, T0, .L04 + +.L01: /* j> 2)) goto L_N3 */ + PTR_SRAI J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + beq ZERO, J, .L_N3 +.align 5 +.L_J1: + PTR_ADDI J, J, -1 + PTR_ADD KK, M, OFFSET + + andi I, M, 15 + beq ZERO, I, .L_M16 + andi I, M, 1 + beqz I, .L_M2 +.L_M1: + PTR_ADDI T0, M, -1 + PTR_SLLI T0, T0, 3 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ALSL A0, KK, AA, 3 /* a + (m - 1) * k + kk */ + PTR_ADD CC, T0, C /* c + (m - 1) */ + + PTR_SLLI T0, KK, 5 + PTR_ADD B0, B, T0 /* b + 4 * kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + dgemm_dsolve_1x4 + PTR_ADDI KK, KK, -1 +.L_M2: + andi I, M, 2 + beqz I, .L_M4 + PTR_SRLI T0, M, 1 + PTR_SLLI T0, T0, 1 + PTR_ADDI T0, T0, -2 + PTR_SLLI T0, T0, 3 /* ((m & -2) - 2) */ + PTR_ADD CC, T0, C /* c + ((m & -2) - 2)*/ + PTR_SLLI T1, KK, 4 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -2) - 2) * k + 2 * kk */ + PTR_SLLI T0, KK, 5 + PTR_ADD B0, B, T0 /* b + 4 * kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + dgemm_dsolve_2x4 + PTR_ADDI KK, KK, -2 +.L_M4: + andi I, M, 4 + beqz I, .L_M8 + PTR_SRLI T0, M, 2 + PTR_SLLI T0, T0, 2 + PTR_ADDI T0, T0, -4 + PTR_SLLI T0, T0, 3 /* ((m & -4) - 4) */ + PTR_ADD CC, T0, C /* c + ((m & -4) - 4)*/ + PTR_SLLI T1, KK, 5 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -4) - 4) * k + 4 * kk */ + PTR_SLLI T0, KK, 5 + PTR_ADD B0, B, T0 /* b + 4 * kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + dgemm_dsolve_4x4 + PTR_ADDI KK, KK, -4 +.L_M8: + andi I, M, 8 + beqz I, .L_M16 + PTR_SRLI T0, M, 3 + PTR_SLLI T0, T0, 3 + PTR_ADDI T0, T0, -8 + PTR_SLLI T0, T0, 3 /* ((m & -8) - 8) */ + PTR_ADD CC, T0, C /* c + ((m & -8) - 8)*/ + PTR_SLLI T1, KK, 6 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -8) - 8) * k + 8 * kk */ + PTR_SLLI T0, KK, 5 + PTR_ADD B0, B, T0 /* b + 4 * kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + dgemm_dsolve_8x4 + PTR_ADDI KK, KK, -8 +.L_M16: + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_M0 + + PTR_SRLI T0, M, 4 + PTR_SLLI T0, T0, 4 + PTR_ADDI T0, T0, -16 /* ((M & -16)) - 16) */ + PTR_SLLI T0, T0, 3 + PTR_MUL AA, T0, K + PTR_ADD AA, A, AA + PTR_ADD CC, C, T0 +.align 5 +.L_I1: + PTR_SLLI T0, KK, 5 + PTR_ADD B0, B, T0 + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + PTR_SLLI T0, KK, 7 + PTR_ADD A0, AA, T0 + dgemm_dsolve_16x4 + PTR_ADDI I, I, -1 + PTR_ADDI KK, KK, -16 + PTR_ADDI CC, CC, -(16 * 8) + PTR_SLLI T0, K, 7 + PTR_SUB AA, AA, T0 + blt ZERO, I, .L_I1 +.L_M0: + PTR_SLLI T0, K, 3 + PTR_ALSL B, T0, B, 2 // b += 4 * k; + PTR_ALSL C, LDC, C, 2 // c += 4 * ldc + blt ZERO, J, .L_J1 +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + + PTR_ADD KK, M, OFFSET + andi I, M, 15 + beq ZERO, I, .L_N3_M16 + andi I, M, 1 + beqz I, .L_N3_M2 +.L_N3_M1: + PTR_ADDI KK, KK, -1 + + PTR_ADDI T0, M, -1 + PTR_SLLI T0, T0, 3 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ALSL A0, KK, AA, 3 /* a + (m - 1) * k + kk */ + PTR_ADD CC, T0, C /* c + (m - 1) */ + + PTR_SLLI T0, KK, 4 + PTR_ADD B0, B, T0 /* b + 2 * kk */ + GADD , d, C0, CC, ZERO, C1, C0, LDC + // dgemm_dsolve_1x2 + GLD f, d, $f0, A0, 0, $f1, C0, 0, $f2, C1, 0 + GMUL f, d, $f1, $f1, $f0, $f2, $f2, $f0 + GST f, d, $f1, C0, 0, $f2, C1, 0, $f1, B0, 0, $f2, B0, 8 +.L_N3_M2: + andi I, M, 2 + beqz I, .L_N3_M4 + PTR_SRLI T0, M, 1 + PTR_SLLI T0, T0, 1 + PTR_ADDI T0, T0, -2 + PTR_SLLI T0, T0, 3 /* ((m & -2) - 2) */ + PTR_ADD CC, T0, C /* c + ((m & -2) - 2)*/ + PTR_SLLI T1, KK, 4 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -2) - 2) * k + 2 * kk */ + PTR_SLLI T0, KK, 4 + PTR_ADD B0, B, T0 /* b + 2 * kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC + dgemm_dsolve_2x2 + PTR_ADDI KK, KK, -2 +.L_N3_M4: + andi I, M, 4 + beqz I, .L_N3_M8 + PTR_SRLI T0, M, 2 + PTR_SLLI T0, T0, 2 + PTR_ADDI T0, T0, -4 + PTR_SLLI T0, T0, 3 /* ((m & -4) - 4) */ + PTR_ADD CC, T0, C /* c + ((m & -4) - 4)*/ + PTR_SLLI T1, KK, 5 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -4) - 4) * k + 4 * kk */ + PTR_SLLI T0, KK, 4 + PTR_ADD B0, B, T0 /* b + 2 * kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC + dgemm_dsolve_4x2 + PTR_ADDI KK, KK, -4 +.L_N3_M8: + andi I, M, 8 + beqz I, .L_N3_M16 + PTR_SRLI T0, M, 3 + PTR_SLLI T0, T0, 3 + PTR_ADDI T0, T0, -8 + PTR_SLLI T0, T0, 3 /* ((m & -8) - 8) */ + PTR_ADD CC, T0, C /* c + ((m & -8) - 8)*/ + PTR_SLLI T1, KK, 6 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -8) - 8) * k + 8 * kk */ + PTR_SLLI T0, KK, 4 + PTR_ADD B0, B, T0 /* b + 2 * kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC + dgemm_dsolve_8x2 + PTR_ADDI KK, KK, -8 +.L_N3_M16: + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N3_M0 + + PTR_SRLI T0, M, 4 + PTR_SLLI T0, T0, 4 + PTR_ADDI T0, T0, -16 /* ((M & -16)) - 16) */ + PTR_SLLI T0, T0, 3 + PTR_MUL AA, T0, K + PTR_ADD AA, A, AA + PTR_ADD CC, C, T0 +.align 5 +.L_N3_I1: + PTR_SLLI T0, KK, 4 + PTR_ADD B0, B, T0 + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC + PTR_SLLI T0, KK, 7 + PTR_ADD A0, AA, T0 + dgemm_dsolve_16x2 + PTR_ADDI I, I, -1 + PTR_ADDI KK, KK, -16 + PTR_ADDI CC, CC, -(16 * 8) + PTR_SLLI T0, K, 7 + PTR_SUB AA, AA, T0 + blt ZERO, I, .L_N3_I1 +.L_N3_M0: + PTR_SLLI T0, K, 3 + PTR_ALSL B, T0, B, 1 // b += 2 * k; + PTR_ALSL C, LDC, C, 1 // c += 2 * ldc +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + + PTR_ADD KK, M, OFFSET + andi I, M, 15 + beq ZERO, I, .L_N1_M16 + andi I, M, 1 + beqz I, .L_N1_M2 +.L_N1_M1: + PTR_ADDI KK, KK, -1 + + PTR_ADDI T0, M, -1 + PTR_SLLI T0, T0, 3 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ALSL A0, KK, AA, 3 /* a + (m - 1) * k + kk */ + PTR_ADD CC, T0, C /* c + (m - 1) */ + + PTR_SLLI T0, KK, 3 + PTR_ADD B0, B, T0 /* b + kk */ + GADD , d, C0, CC, ZERO + // dgemm_dsolve_1x1 + GLD f, d, $f0, A0, 0, $f1, C0, 0 + GMUL f, d, $f1, $f1, $f0 + GST f, d, $f1, C0, 0, $f1, B0, 0 +.L_N1_M2: + andi I, M, 2 + beqz I, .L_N1_M4 + PTR_SRLI T0, M, 1 + PTR_SLLI T0, T0, 1 + PTR_ADDI T0, T0, -2 + PTR_SLLI T0, T0, 3 /* ((m & -2) - 2) */ + PTR_ADD CC, T0, C /* c + ((m & -2) - 2)*/ + PTR_SLLI T1, KK, 4 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -2) - 2) * k + 2 * kk */ + PTR_SLLI T0, KK, 3 + PTR_ADD B0, B, T0 /* b + kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO + dgemm_dsolve_2x1 + PTR_ADDI KK, KK, -2 +.L_N1_M4: + andi I, M, 4 + beqz I, .L_N1_M8 + PTR_SRLI T0, M, 2 + PTR_SLLI T0, T0, 2 + PTR_ADDI T0, T0, -4 + PTR_SLLI T0, T0, 3 /* ((m & -4) - 4) */ + PTR_ADD CC, T0, C /* c + ((m & -4) - 4)*/ + PTR_SLLI T1, KK, 5 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -4) - 4) * k + 4 * kk */ + PTR_SLLI T0, KK, 3 + PTR_ADD B0, B, T0 /* b + kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO + dgemm_dsolve_4x1 + PTR_ADDI KK, KK, -4 +.L_N1_M8: + andi I, M, 8 + beqz I, .L_N1_M16 + PTR_SRLI T0, M, 3 + PTR_SLLI T0, T0, 3 + PTR_ADDI T0, T0, -8 + PTR_SLLI T0, T0, 3 /* ((m & -8) - 8) */ + PTR_ADD CC, T0, C /* c + ((m & -8) - 8)*/ + PTR_SLLI T1, KK, 6 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -8) - 8) * k + 8 * kk */ + PTR_SLLI T0, KK, 3 + PTR_ADD B0, B, T0 /* b + kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO + dgemm_dsolve_8x1 + PTR_ADDI KK, KK, -8 +.L_N1_M16: + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N1_M0 + + PTR_SRLI T0, M, 4 + PTR_SLLI T0, T0, 4 + PTR_ADDI T0, T0, -16 /* ((M & -16)) - 16) */ + PTR_SLLI T0, T0, 3 + PTR_MUL AA, T0, K + PTR_ADD AA, A, AA + PTR_ADD CC, C, T0 +.align 5 +.L_N1_I1: + PTR_SLLI T0, KK, 3 + PTR_ADD B0, B, T0 + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO + PTR_SLLI T0, KK, 7 + PTR_ADD A0, AA, T0 + dgemm_dsolve_16x1 + PTR_ADDI I, I, -1 + PTR_ADDI KK, KK, -16 + PTR_ADDI CC, CC, -(16 * 8) + PTR_SLLI T0, K, 7 + PTR_SUB AA, AA, T0 + blt ZERO, I, .L_N1_I1 +.L_N1_M0: +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S new file mode 100644 index 000000000..0e2cacccf --- /dev/null +++ b/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S @@ -0,0 +1,959 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/26 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + * FLOAT *c, BLASLONG ldc, BLASLONG offset) + */ + +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc +#define OFFSET $r11 // param 9: offset + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define T0 $r25 +#define T1 $r26 +#define T2 $r27 +#define KK $r28 +#define AA $r29 +#define CC $r30 +#define BB B0 +#undef ZERO +#define ZERO $r0 + +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 +#define G0 D0 +#define G1 D1 +#define G2 D2 +#define G3 D3 +#define G4 D4 +#define G5 D5 +#define G6 D6 +#define G7 D7 +#define G8 D8 +#define G9 D9 +#define G10 D10 +#define G11 D11 +#define G12 D12 +#define G13 D13 +#define G14 D14 +#define G15 D15 + +/* Prefetch interval */ +#define A_PRE 0x400 +#define B_PRE 0x100 + +#include "dtrsm_kernel_macro.S" + +.macro ldrepl_macro start, end, stride +// Load Ux (x = 0...15) +.if \start <= \end + GLDREPL xv, d, $xr\start, A0, \stride * 8 + ldrepl_macro %start + 1, \end, %stride + 1 +.endif +.endm +.macro nmsub_macro start0, end0, start1, reg +// Gx -= reg * Ux +.if \start0 <= \end0 + xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 + nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg +.endif +.endm +.macro B_st_macro start, end, stride, N +// Store Gx(x = 16...31) +.if \start <= \end +.if \N == 4 + xvst $xr\start, B0, \stride * 0x20 +.elseif \N == 2 + vst $vr\start, B0, \stride * 0x10 +.elseif \N == 1 + fst.d $f\start, B0, \stride * 0x08 +.endif + B_st_macro %start + 1, \end, %stride + 1, \N +.endif +.endm + +.macro dsolve_16 N +// The data layout of C (4x16) is as follows (store 4 data in each register): +// U0 U1 U2 U3 +// U4 U5 U6 U7 +// U8 U9 U10 U11 +// U12 U13 U14 U15 +// The first step is to transpose the result of C + GTRANSPOSE4x4_D U3, U7, U11, U15, G12, G13, G14, G15, D0, D1 + GTRANSPOSE4x4_D U2, U6, U10, U14, G8, G9, G10, G11, D0, D1 + GTRANSPOSE4x4_D U1, U5, U9, U13, G4, G5, G6, G7, U3, U7 + GTRANSPOSE4x4_D U0, U4, U8, U12, G0, G1, G2, G3, U3, U7 +// Now we have the following memory layout of C: +// 0 1 2 3 ... 15 +// 0 | | | | | | | +// 1 | G0 | G1 | G2 | G3 | ... | G15 | +// 2 | | | | | | | +// 3 | | | | | | | +// Next we are going to process matrix A with a size of 16x16, +// using only the upper triangular portion. The memory layout of +// matrix A is as follows, quite large. +//0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +// 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +// 34 35 36 37 38 39 40 41 42 43 44 45 46 47 +// 51 52 53 54 55 56 57 58 59 60 61 62 63 +// 68 69 70 71 72 73 74 75 76 77 78 79 +// 85 86 87 88 89 90 91 92 93 94 95 +// 102 103 104 105 106 107 108 109 110 111 +// 119 120 121 122 123 124 125 126 127 +// 136 137 138 139 140 141 142 143 +// 153 154 155 156 157 158 159 +// 170 171 172 173 174 175 +// 187 188 189 190 191 +// 204 205 206 207 +// 221 222 223 +// 238 239 +// 255 +// Sequentially extract data from A in row order +// Load 0 + ldrepl_macro 0, 15, 0 + GMUL xvf, d, G0, G0, U0 + nmsub_macro 17, 31, 1, G0 + PTR_ADDI A0, A0, 17 * 8 +// Load 1 + ldrepl_macro 1, 15, 0 + GMUL xvf, d, G1, G1, U1 + nmsub_macro 18, 31, 2, G1 + PTR_ADDI A0, A0, 17 * 8 +// Load 2 + ldrepl_macro 2, 15, 0 + GMUL xvf, d, G2, G2, U2 + nmsub_macro 19, 31, 3, G2 + PTR_ADDI A0, A0, 17 * 8 +// Load 3 + ldrepl_macro 3, 15, 0 + GMUL xvf, d, G3, G3, U3 + nmsub_macro 20, 31, 4, G3 + PTR_ADDI A0, A0, 17 * 8 +// Load 4 + ldrepl_macro 4, 15, 0 + GMUL xvf, d, G4, G4, U4 + nmsub_macro 21, 31, 5, G4 + PTR_ADDI A0, A0, 17 * 8 +// Load 5 + ldrepl_macro 5, 15, 0 + GMUL xvf, d, G5, G5, U5 + nmsub_macro 22, 31, 6, G5 + PTR_ADDI A0, A0, 17 * 8 +// Load 6 + ldrepl_macro 6, 15, 0 + GMUL xvf, d, G6, G6, U6 + nmsub_macro 23, 31, 7, G6 + PTR_ADDI A0, A0, 17 * 8 +// Load 7 + ldrepl_macro 7, 15, 0 + GMUL xvf, d, G7, G7, U7 + nmsub_macro 24, 31, 8, G7 + PTR_ADDI A0, A0, 17 * 8 +// Load 8 + ldrepl_macro 8, 15, 0 + GMUL xvf, d, G8, G8, U8 + nmsub_macro 25, 31, 9, G8 + PTR_ADDI A0, A0, 17 * 8 +// Load 9 + ldrepl_macro 9, 15, 0 + GMUL xvf, d, G9, G9, U9 + nmsub_macro 26, 31, 10, G9 + PTR_ADDI A0, A0, 17 * 8 +// Load 10 + ldrepl_macro 10, 15, 0 + GMUL xvf, d, G10, G10, U10 + nmsub_macro 27, 31, 11, G10 + PTR_ADDI A0, A0, 17 * 8 +// Load 11 + ldrepl_macro 11, 15, 0 + GMUL xvf, d, G11, G11, U11 + nmsub_macro 28, 31, 12, G11 + PTR_ADDI A0, A0, 17 * 8 +// Load 12 + ldrepl_macro 12, 15, 0 + GMUL xvf, d, G12, G12, U12 + nmsub_macro 29, 31, 13, G12 + PTR_ADDI A0, A0, 17 * 8 +// Load 13 + ldrepl_macro 13, 15, 0 + GMUL xvf, d, G13, G13, U13 + nmsub_macro 30, 31, 14, G13 + PTR_ADDI A0, A0, 17 * 8 +// Load 14 + ldrepl_macro 14, 15, 0 + GMUL xvf, d, G14, G14, U14 + nmsub_macro 31, 31, 15, G14 + PTR_ADDI A0, A0, 17 * 8 +// Load 15 + ldrepl_macro 15, 15, 0 + GMUL xvf, d, G15, G15, U15 +// Finally, We can store the result. +// For B, stored sequentially, and C, first transpose and then store + B_st_macro 16, 31, 0, \N + GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 + GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1 + GTRANSPOSE4x4_D G8, G9, G10, G11, G8, G9, G10, G11, U0, U1 + GTRANSPOSE4x4_D G12, G13, G14, G15, G12, G13, G14, G15, U0, U1 +.if \N == 4 + GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8, C0, 0x40, G12, C0, 0x60, \ + G1, C1, 0x00, G5, C1, 0x20, G9, C1, 0x40, G13, C1, 0x60, \ + G2, C2, 0x00, G6, C2, 0x20, G10, C2, 0x40, G14, C2, 0x60, \ + G3, C3, 0x00, G7, C3, 0x20, G11, C3, 0x40, G15, C3, 0x60 +.elseif \N == 2 + GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8, C0, 0x40, G12, C0, 0x60, \ + G1, C1, 0x00, G5, C1, 0x20, G9, C1, 0x40, G13, C1, 0x60 +.elseif \N == 1 + GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8, C0, 0x40, G12, C0, 0x60 +.endif +.endm + +.macro dgemm_dsolve_16x4 + bge ZERO, KK, .L_dsolve_16x4_load + dgemm_16x4 + b .L_dsolve_16x4 +.L_dsolve_16x4_load: + // Load C + GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 + GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 + GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 + GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 +/********************** solver ******************/ +.L_dsolve_16x4: + dsolve_16 4 +.endm + +.macro dsolve_8 N +// The data layout of C (4x8) is as follows (store 4 data in each register): +// U0 U1 +// U2 U3 +// U4 U5 +// U6 U7 +// The first step is to transpose the result of C + GTRANSPOSE4x4_D U1, U3, U5, U7, G4, G5, G6, G7, G8, G9 + GTRANSPOSE4x4_D U0, U2, U4, U6, G0, G1, G2, G3, G8, G9 +// Now we have the following memory layout of C: +// 0 1 2 3 ... 7 +// 0 | | | | | | | +// 1 | G0 | G1 | G2 | G3 | ... | G7 | +// 2 | | | | | | | +// 3 | | | | | | | +// Next we are going to process matrix A with a size of 8x8, +// using only the upper triangular portion. The memory layout of +// matrix A is as follows: +//0 1 2 3 4 5 6 7 +// 9 10 11 12 13 14 15 +// 18 19 20 21 22 23 +// 27 28 29 30 31 +// 36 37 38 39 +// 45 46 47 +// 54 55 +// 63 +// Sequentially extract data from A in row order +// Load 0 + ldrepl_macro 0, 7, 0 + GMUL xvf, d, G0, G0, U0 + nmsub_macro 17, 23, 1, G0 + PTR_ADDI A0, A0, 9 * 8 +// Load 1 + ldrepl_macro 1, 7, 0 + GMUL xvf, d, G1, G1, U1 + nmsub_macro 18, 23, 2, G1 + PTR_ADDI A0, A0, 9 * 8 +// Load 2 + ldrepl_macro 2, 7, 0 + GMUL xvf, d, G2, G2, U2 + nmsub_macro 19, 23, 3, G2 + PTR_ADDI A0, A0, 9 * 8 +// Load 3 + ldrepl_macro 3, 7, 0 + GMUL xvf, d, G3, G3, U3 + nmsub_macro 20, 23, 4, G3 + PTR_ADDI A0, A0, 9 * 8 +// Load 4 + ldrepl_macro 4, 7, 0 + GMUL xvf, d, G4, G4, U4 + nmsub_macro 21, 23, 5, G4 + PTR_ADDI A0, A0, 9 * 8 +// Load 5 + ldrepl_macro 5, 7, 0 + GMUL xvf, d, G5, G5, U5 + nmsub_macro 22, 23, 6, G5 + PTR_ADDI A0, A0, 9 * 8 +// Load 6 + ldrepl_macro 6, 7, 0 + GMUL xvf, d, G6, G6, U6 + nmsub_macro 23, 23, 7, G6 + PTR_ADDI A0, A0, 9 * 8 +// Load 7 + ldrepl_macro 7, 7, 0 + GMUL xvf, d, G7, G7, U7 +// Finally, We can store the result. +// For B, stored sequentially, and C, first transpose and then store + B_st_macro 16, 23, 0, \N + GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 + GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1 +.if \N == 4 + GST xv, , G0, C0, 0x00, G4, C0, 0x20, \ + G1, C1, 0x00, G5, C1, 0x20, \ + G2, C2, 0x00, G6, C2, 0x20, \ + G3, C3, 0x00, G7, C3, 0x20 +.elseif \N == 2 + GST xv, , G0, C0, 0x00, G4, C0, 0x20, \ + G1, C1, 0x00, G5, C1, 0x20 +.elseif \N == 1 + GST xv, , G0, C0, 0x00, G4, C0, 0x20 +.endif +.endm + +.macro dgemm_dsolve_8x4 + bge ZERO, L, .L_dsolve_8x4_load + dgemm_8x4 + b .L_dsolve_8x4 +.L_dsolve_8x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + + /* Load C2 */ + xvld U4, C2, 0x00 + xvld U5, C2, 0x20 + + /* Load C3 */ + xvld U6, C3, 0x00 + xvld U7, C3, 0x20 +/********* solver *********/ +.L_dsolve_8x4: + dsolve_8 4 +.endm + +.macro dsolve_4 N +// The data layout of C (4x4) is as follows (store 4 data in each register): +// U0 +// U1 +// U2 +// U3 +// The first step is to transpose the result of C + GTRANSPOSE4x4_D U0, U1, U2, U3, G0, G1, G2, G3, G4, G5 +// Now we have the following memory layout of C: +// 0 1 2 3 +// 0 | | | | | +// 1 | G0 | G1 | G2 | G3 | +// 2 | | | | | +// 3 | | | | | +// Next we are going to process matrix A with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix A is as follows: +//0 1 2 3 +// 5 6 7 +// 10 11 +// 15 +// Sequentially extract data from A in row order +// Load 0 + ldrepl_macro 0, 3, 0 + GMUL xvf, d, G0, G0, U0 + nmsub_macro 17, 19, 1, G0 + PTR_ADDI A0, A0, 5 * 8 +// Load 1 + ldrepl_macro 1, 3, 0 + GMUL xvf, d, G1, G1, U1 + nmsub_macro 18, 19, 2, G1 + PTR_ADDI A0, A0, 5 * 8 +// Load 2 + ldrepl_macro 2, 3, 0 + GMUL xvf, d, G2, G2, U2 + nmsub_macro 19, 19, 3, G2 + PTR_ADDI A0, A0, 5 * 8 +// Load 3 + ldrepl_macro 3, 3, 0 + GMUL xvf, d, G3, G3, U3 +// Finally, We can store the result. +// For B, stored sequentially, and C, first transpose and then store + B_st_macro 16, 19, 0, \N + GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 +.if \N == 4 + GST xv, , G0, C0, 0x00, G1, C1, 0x00, G2, C2, 0x00, G3, C3, 0x00 +.elseif \N == 2 + GST xv, , G0, C0, 0x00, G1, C1, 0x00 +.elseif \N == 1 + GST xv, , G0, C0, 0x00 +.endif +.endm + +.macro dgemm_dsolve_4x4 + bge ZERO, L, .L_dsolve_4x4_load + dgemm_4x4 + b .L_dsolve_4x4 +.L_dsolve_4x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 +/************** solver *****************/ +.L_dsolve_4x4: + dsolve_4 4 +.endm + +.macro dsolve_2 N +// Transpose + GSBUTTERFLY xv, d, G0, G1, U1, U0 +// Now we have the following memory layout of C: +// 0 1 +// 0 | | | +// 1 | G0 | G1 | +// 2 | | | +// 3 | | | +// Next we are going to process matrix A with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix A is as follows: +//0 1 +// 3 +// Sequentially extract data from A in row order +// Load 0 + ldrepl_macro 0, 1, 0 + GMUL xvf, d, G0, G0, U0 + nmsub_macro 17, 17, 1, G0 + PTR_ADDI A0, A0, 3 * 8 +// Load 1 + ldrepl_macro 1, 1, 0 + GMUL xvf, d, G1, G1, U1 +// Finally, We can store the result. +// For B, stored sequentially, and C, first transpose and then store + B_st_macro 16, 17, 0, \N + GSBUTTERFLY xv, d, U0, U1, G1, G0 +.if \N == 4 + vst $vr0, C0, 0x00 + vst $vr1, C1, 0x00 + xvstelm.d U0, C2, 0x00, 0x02 + xvstelm.d U1, C3, 0x00, 0x02 + xvstelm.d U0, C2, 0x08, 0x03 + xvstelm.d U1, C3, 0x08, 0x03 +.elseif \N == 2 + vst $vr0, C0, 0x00 + vst $vr1, C1, 0x00 +.elseif \N == 1 + vst $vr0, C0, 0x00 +.endif +.endm + +.macro dgemm_dsolve_2x4 + bge ZERO, L, .L_dsolve_2x4_load + dgemm_2x4 + b .L_dsolve_2x4 +.L_dsolve_2x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U1, U3, 0x02 +/********************** solver ******************/ +.L_dsolve_2x4: + dsolve_2 4 +.endm + +.macro dgemm_dsolve_1x4 + bge ZERO, L, .L_dsolve_1x4_load + dgemm_1x4 + b .L_dsolve_1x4 +.L_dsolve_1x4_load: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 + fld.d $f2, C2, 0x00 + fld.d $f3, C3, 0x00 + xvinsve0.d U0, U1, 0x01 + xvinsve0.d U0, U2, 0x02 + xvinsve0.d U0, U3, 0x03 +.L_dsolve_1x4: + GLDREPL xv, d, D0, A0, 0x00 + GMUL xvf, d, U0, U0, D0 + // Store C + xvstelm.d U0, C0, 0x00, 0x00 + xvstelm.d U0, C1, 0x00, 0x01 + xvstelm.d U0, C2, 0x00, 0x02 + xvstelm.d U0, C3, 0x00, 0x03 + // Store B + xvst U0, B0, 0x00 +.endm + +.macro dgemm_dsolve_16x2 + bge ZERO, L, .L_dsolve_16x2_load + dgemm_16x2 + b .L_dsolve_16x2 +.L_dsolve_16x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + /* Load C1 */ + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 +.L_dsolve_16x2: + dsolve_16 2 +.endm + +.macro dgemm_dsolve_8x2 + bge ZERO, L, .L_dsolve_8x2_load + dgemm_8x2 + b .L_dsolve_8x2 +.L_dsolve_8x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 +.L_dsolve_8x2: + dsolve_8 2 +.endm + +.macro dgemm_dsolve_4x2 + bge ZERO, L, .L_dsolve_4x2_load + dgemm_4x2 + b .L_dsolve_4x2 +.L_dsolve_4x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_4x2: + dsolve_4 2 +.endm + +.macro dgemm_dsolve_1x2 + bge ZERO, L, .L_dsolve_1x2_load + dgemm_1x2 + b .L_dsolve_1x2 +.L_dsolve_1x2_load: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 + xvinsve0.d U0, U1, 0x01 +.L_dsolve_1x2: + GLDREPL xv, d, D0, A0, 0x00 + GMUL xvf, d, U0, U0, D0 + // Store C + xvstelm.d U0, C0, 0x00, 0x00 + xvstelm.d U0, C1, 0x00, 0x01 + // Store B + vst $vr0, B0, 0x00 +.endm + +.macro dgemm_dsolve_2x2 + bge ZERO, L, .L_dsolve_2x2_load + dgemm_2x2 + b .L_dsolve_2x2 +.L_dsolve_2x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_2x2: + dsolve_2 2 +.endm + +.macro dgemm_dsolve_16x1 + bge ZERO, L, .L_dsolve_16x1_load + dgemm_16x1 + b .L_dsolve_16x1 +.L_dsolve_16x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 +.L_dsolve_16x1: + dsolve_16 1 +.endm + +.macro dgemm_dsolve_8x1 + bge ZERO, L, .L_dsolve_8x1_load + dgemm_8x1 + b .L_dsolve_8x1 +.L_dsolve_8x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 +.L_dsolve_8x1: + dsolve_8 1 +.endm + +.macro dgemm_dsolve_4x1 + bge ZERO, L, .L_dsolve_4x1_load + dgemm_4x1 + b .L_dsolve_4x1 +.L_dsolve_4x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 +.L_dsolve_4x1: + dsolve_4 1 +.endm + +.macro dgemm_dsolve_2x1 + bge ZERO, L, .L_dsolve_2x1_load + dgemm_2x1 + b .L_dsolve_2x1 +.L_dsolve_2x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 +.L_dsolve_2x1: + dsolve_2 1 +.endm + +.macro dgemm_dsolve_1x1 + bge ZERO, L, .L_dsolve_1x1_load + dgemm_1x1 + b .L_dsolve_1x1 +.L_dsolve_1x1_load: + // Load C + fld.d $f0, C0, 0x00 +.L_dsolve_1x1: + GLDREPL xv, d, D0, A0, 0x00 + GMUL xvf, d, U0, U0, D0 + // Store C + xvstelm.d U0, C0, 0x00, 0x00 + // Store B + xvstelm.d U0, B0, 0x00, 0x00 +.endm + + PROLOGUE + push_if_used 26, 32 + PTR_SLLI LDC, LDC, 3 + /* if (!(N >> 2)) goto L_N3 */ + PTR_SRAI J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + beq ZERO, J, .L_N3 +.align 5 +.L_J1: + PTR_ADDI J, J, -1 + move KK, OFFSET + move AA, A + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_M15 +.align 4 +.L_I1: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_16x4 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADDI KK, KK, 0x10 // kk += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_I1 +.L_M15: + andi I, M, 8 + beqz I, .L_M7 +.L_M8: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_8x4 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADDI KK, KK, 0x08 // kk += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_M7: + andi I, M, 4 + beqz I, .L_M3 +.L_M4: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_4x4 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADDI KK, KK, 0x04 // kk += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_M3: + andi I, M, 2 + beqz I, .L_M1 +.L_M2: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_2x4 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADDI KK, KK, 0x02 // kk += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_M1: + andi I, M, 1 + beqz I, .L_M0 + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_1x4 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADDI KK, KK, 0x01 // kk += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_M0: + PTR_SLLI T0, K, 5 + PTR_SLLI T1, LDC, 2 + PTR_ADD B, B, T0 // b += 4 * k + PTR_ADD C, C, T1 // c += 4 * ldc + bnez J, .L_J1 +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 +.L_N2: + move KK, OFFSET + move AA, A + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_N2_M15 +.align 4 +.L_N2_I1: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_16x2 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADDI KK, KK, 0x10 // kk += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_N2_I1 +.L_N2_M15: + andi I, M, 8 + beqz I, .L_N2_M7 +.L_N2_M8: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_8x2 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADDI KK, KK, 0x08 // kk += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_N2_M7: + andi I, M, 4 + beqz I, .L_N2_M3 +.L_N2_M4: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_4x2 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADDI KK, KK, 0x04 // kk += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_N2_M3: + andi I, M, 2 + beqz I, .L_N2_M1 +.L_N2_M2: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_2x2 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADDI KK, KK, 0x02 // kk += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_N2_M1: + andi I, M, 1 + beqz I, .L_N2_M0 + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_1x2 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADDI KK, KK, 0x01 // kk += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_N2_M0: + PTR_SLLI T0, K, 4 + PTR_SLLI T1, LDC, 1 + PTR_ADD B, B, T0 // b += 2 * k + PTR_ADD C, C, T1 // c += 2 * ldc +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + + move KK, OFFSET + move AA, A + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_N1_M15 +.align 4 +.L_N1_I1: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_16x1 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADDI KK, KK, 0x10 // kk += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_N1_I1 +.L_N1_M15: + andi I, M, 8 + beqz I, .L_N1_M7 +.L_N1_M8: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_8x1 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADDI KK, KK, 0x08 // kk += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_N1_M7: + andi I, M, 4 + beqz I, .L_N1_M3 +.L_N1_M4: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_4x1 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADDI KK, KK, 0x04 // kk += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_N1_M3: + andi I, M, 2 + beqz I, .L_N1_M1 +.L_N1_M2: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_2x1 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADDI KK, KK, 0x02 // kk += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_N1_M1: + andi I, M, 1 + beqz I, .L_N1_M0 + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_1x1 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADDI KK, KK, 0x01 // kk += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_N1_M0: +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S new file mode 100644 index 000000000..421339736 --- /dev/null +++ b/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S @@ -0,0 +1,882 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/09/26 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + * FLOAT *c, BLASLONG ldc, BLASLONG offset) + */ + +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc +#define OFFSET $r11 // param 9: offset + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define T0 $r25 +#define T1 $r26 +#define T2 $r27 +#define KK $r28 +#define AA $r29 +#define CC $r30 +#define BB B0 +#undef ZERO +#define ZERO $r0 + +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 +#define G0 D0 +#define G1 D1 +#define G2 D2 +#define G3 D3 +#define G4 D4 +#define G5 D5 +#define G6 D6 +#define G7 D7 +#define G8 D8 +#define G9 D9 +#define G10 D10 +#define G11 D11 +#define G12 D12 +#define G13 D13 +#define G14 D14 +#define G15 D15 + +/* Prefetch interval */ +#define A_PRE 0x400 +#define B_PRE 0x100 + +#include "dtrsm_kernel_macro.S" + +.macro ldrepl_macro start, end, stride +// Load Ux (x = 0...15) +.if \start <= \end + GLDREPL xv, d, $xr\start, B0, \stride * 8 + ldrepl_macro %start + 1, \end, %stride + 1 +.endif +.endm + +.macro nmsub_macro start0, end0, start1, reg +// Ux -= reg * Dx +.if \start0 <= \end0 + xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 + nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg +.endif +.endm + +.macro A_st_macro start, end, stride, N +// Store Ux(x = 0...15) +.if \start <= \end +.if \N == 4 + xvst $xr\start, A0, \stride * 0x20 +.elseif \N == 2 + vst $vr\start, A0, \stride * 0x10 +.elseif \N == 1 + fst.d $f\start, A0, \stride * 0x08 +.endif + A_st_macro %start + 1, \end, %stride + 1, \N +.endif +.endm + +.macro dsolve_16x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 2 3 +// 5 6 7 +// 10 11 +// 15 +// Sequentially extract data from B in row order + ldrepl_macro 16, 19, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 + ldrepl_macro 20, 22, 5 + nmsub_macro 4, 7, 0, D1 + ldrepl_macro 23, 24, 10 + GMUL xvf, d, U4, D4, U4, U5, D4, U5, U6, D4, U6, U7, D4, U7 + ldrepl_macro 25, 25, 15 + nmsub_macro 8, 11, 0, D2 + nmsub_macro 8, 11, 4, D5 + GMUL xvf, d, U8, D7, U8, U9, D7, U9, U10, D7, U10, U11, D7, U11 + nmsub_macro 12, 15, 0, D3 + nmsub_macro 12, 15, 4, D6 + nmsub_macro 12, 15, 8, D8 + GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15 +// Store A + A_st_macro 0, 15, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ + U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \ + U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60, \ + U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 +.endm + +.macro dsolve_16x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 +// 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 17, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 + ldrepl_macro 18, 18, 3 + nmsub_macro 4, 7, 0, D1 + GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 +// Store A + A_st_macro 0, 7, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ + U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 +.endm + +.macro dsolve_8x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 2 3 +// 5 6 7 +// 10 11 +// 15 +// Sequentially extract data from B in row order + ldrepl_macro 16, 19, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1 + ldrepl_macro 20, 22, 5 + nmsub_macro 2, 3, 0, D1 + ldrepl_macro 23, 24, 10 + GMUL xvf, d, U2, D4, U2, U3, D4, U3 + ldrepl_macro 25, 25, 15 + nmsub_macro 4, 5, 0, D2 + nmsub_macro 4, 5, 2, D5 + GMUL xvf, d, U4, D7, U4, U5, D7, U5 + nmsub_macro 6, 7, 0, D3 + nmsub_macro 6, 7, 2, D6 + nmsub_macro 6, 7, 4, D8 + GMUL xvf, d, U6, D9, U6, U7, D9, U7 +// Store A + A_st_macro 0, 7, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ + U2, C1, 0x00, U3, C1, 0x20, \ + U4, C2, 0x00, U5, C2, 0x20, \ + U6, C3, 0x00, U7, C3, 0x20 +.endm + +.macro dsolve_8x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 +// 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 17, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1 + ldrepl_macro 18, 18, 3 + nmsub_macro 2, 3, 0, D1 + GMUL xvf, d, U2, D2, U2, U3, D2, U3 +// Store A + A_st_macro 0, 3, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ + U2, C1, 0x00, U3, C1, 0x20 +.endm + +.macro dsolve_4x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 2 3 +// 5 6 7 +// 10 11 +// 15 +// Sequentially extract data from B in row order + ldrepl_macro 16, 19, 0 + GMUL xvf, d, U0, D0, U0 + ldrepl_macro 20, 22, 5 + nmsub_macro 1, 1, 0, D1 + ldrepl_macro 23, 24, 10 + GMUL xvf, d, U1, D4, U1 + ldrepl_macro 25, 25, 15 + nmsub_macro 2, 2, 0, D2 + nmsub_macro 2, 2, 1, D5 + GMUL xvf, d, U2, D7, U2 + nmsub_macro 3, 3, 0, D3 + nmsub_macro 3, 3, 1, D6 + nmsub_macro 3, 3, 2, D8 + GMUL xvf, d, U3, D9, U3 +// Store A + A_st_macro 0, 3, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00 +.endm + +.macro dsolve_4x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 +// 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 17, 0 + GMUL xvf, d, U0, D0, U0 + ldrepl_macro 18, 18, 3 + nmsub_macro 1, 1, 0, D1 + GMUL xvf, d, U1, D2, U1 +// Store A + A_st_macro 0, 1, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C1, 0x00 +.endm + +.macro dsolve_2x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 2 3 +// 5 6 7 +// 10 11 +// 15 +// Sequentially extract data from B in row order + ldrepl_macro 16, 19, 0 + GMUL xvf, d, U0, D0, U0 + ldrepl_macro 20, 22, 5 + nmsub_macro 1, 1, 0, D1 + ldrepl_macro 23, 24, 10 + GMUL xvf, d, U1, D4, U1 + + ldrepl_macro 25, 25, 15 + nmsub_macro 2, 2, 0, D2 + nmsub_macro 2, 2, 1, D5 + GMUL xvf, d, U2, D7, U2 + nmsub_macro 3, 3, 0, D3 + nmsub_macro 3, 3, 1, D6 + nmsub_macro 3, 3, 2, D8 + GMUL xvf, d, U3, D9, U3 +// Store A + A_st_macro 0, 3, 0, 2 +// Store C + GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00, +.endm + +.macro dsolve_2x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 +// 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 17, 0 + GMUL xvf, d, U0, D0, U0 + ldrepl_macro 18, 18, 3 + nmsub_macro 1, 1, 0, D1 + GMUL xvf, d, U1, D2, U1 +// Store A + A_st_macro 0, 1, 0, 2 +// Store C + GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00 +.endm + +.macro dsolve_1x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 2 3 +// 5 6 7 +// 10 11 +// 15 +// Sequentially extract data from B in row order + ldrepl_macro 16, 19, 0 + GMUL xvf, d, U0, D0, U0 + ldrepl_macro 20, 22, 5 + nmsub_macro 1, 1, 0, D1 + ldrepl_macro 23, 24, 10 + GMUL xvf, d, U1, D4, U1 + + ldrepl_macro 25, 25, 15 + nmsub_macro 2, 2, 0, D2 + nmsub_macro 2, 2, 1, D5 + GMUL xvf, d, U2, D7, U2 + nmsub_macro 3, 3, 0, D3 + nmsub_macro 3, 3, 1, D6 + nmsub_macro 3, 3, 2, D8 + GMUL xvf, d, U3, D9, U3 +// Store A + A_st_macro 0, 3, 0, 1 +// Store C + GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00, +.endm + +.macro dsolve_1x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 +// 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 17, 0 + GMUL xvf, d, U0, D0, U0 + ldrepl_macro 18, 18, 3 + nmsub_macro 1, 1, 0, D1 + GMUL xvf, d, U1, D2, U1 +// Store A + A_st_macro 0, 1, 0, 1 +// Store C + GST f, d, $f0, C0, 0x00, $f1, C1, 0x00 +.endm + +.macro dgemm_dsolve_16x4 + bge ZERO, L, .L_dsolve_16x4_load + dgemm_16x4 + b .L_dsolve_16x4 +.L_dsolve_16x4_load: + // Load C + GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 + GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 + GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 + GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 +/********************** solver ******************/ +.L_dsolve_16x4: + dsolve_16x4 +.endm + +.macro dgemm_dsolve_8x4 + bge ZERO, L, .L_dsolve_8x4_load + dgemm_8x4 + b .L_dsolve_8x4 +.L_dsolve_8x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + + /* Load C2 */ + xvld U4, C2, 0x00 + xvld U5, C2, 0x20 + + /* Load C3 */ + xvld U6, C3, 0x00 + xvld U7, C3, 0x20 +/********* solver *********/ +.L_dsolve_8x4: + dsolve_8x4 +.endm + +.macro dgemm_dsolve_4x4 + bge ZERO, L, .L_dsolve_4x4_load + dgemm_4x4 + b .L_dsolve_4x4 +.L_dsolve_4x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 +/************** solver *****************/ +.L_dsolve_4x4: + dsolve_4x4 +.endm + +.macro dgemm_dsolve_2x4 + bge ZERO, L, .L_dsolve_2x4_load + dgemm_2x4 + xvpermi.q U2, U0, 0x01 + xvpermi.q U3, U1, 0x01 + b .L_dsolve_2x4 +.L_dsolve_2x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 +/********************** solver ******************/ +.L_dsolve_2x4: + dsolve_2x4 +.endm + +.macro dgemm_dsolve_1x4 + bge ZERO, L, .L_dsolve_1x4_load + dgemm_1x4 + xvpackod.d U1, U0, U0 + xvpermi.q U2, U0, 0x01 + xvpermi.q U3, U1, 0x01 + b .L_dsolve_1x4 +.L_dsolve_1x4_load: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 + fld.d $f2, C2, 0x00 + fld.d $f3, C3, 0x00 +.L_dsolve_1x4: + dsolve_1x4 +.endm + +.macro dgemm_dsolve_16x2 + bge ZERO, L, .L_dsolve_16x2_load + dgemm_16x2 + b .L_dsolve_16x2 +.L_dsolve_16x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + /* Load C1 */ + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 +.L_dsolve_16x2: + dsolve_16x2 +.endm + +.macro dgemm_dsolve_8x2 + bge ZERO, L, .L_dsolve_8x2_load + dgemm_8x2 + b .L_dsolve_8x2 +.L_dsolve_8x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 +.L_dsolve_8x2: + dsolve_8x2 +.endm + +.macro dgemm_dsolve_4x2 + bge ZERO, L, .L_dsolve_4x2_load + dgemm_4x2 + b .L_dsolve_4x2 +.L_dsolve_4x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_4x2: + dsolve_4x2 +.endm + +.macro dgemm_dsolve_2x2 + bge ZERO, L, .L_dsolve_2x2_load + dgemm_2x2 + b .L_dsolve_2x2 +.L_dsolve_2x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_2x2: + dsolve_2x2 +.endm + +.macro dgemm_dsolve_1x2 + bge ZERO, L, .L_dsolve_1x2_load + dgemm_1x2 + xvpackod.d U1, U0, U0 + b .L_dsolve_1x2 +.L_dsolve_1x2_load: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 +.L_dsolve_1x2: + dsolve_1x2 +.endm + +.macro dgemm_dsolve_16x1 + bge ZERO, L, .L_dsolve_16x1_load + dgemm_16x1 + b .L_dsolve_16x1 +.L_dsolve_16x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 +.L_dsolve_16x1: + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 + // Store A + A_st_macro 0, 3, 0, 4 + // Strore C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 +.endm + +.macro dgemm_dsolve_8x1 + bge ZERO, L, .L_dsolve_8x1_load + dgemm_8x1 + b .L_dsolve_8x1 +.L_dsolve_8x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 +.L_dsolve_8x1: + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1 + // Store A + A_st_macro 0, 1, 0, 4 + // Strore C + GST xv, , U0, C0, 0x00, U1, C0, 0x20 +.endm + +.macro dgemm_dsolve_4x1 + bge ZERO, L, .L_dsolve_4x1_load + dgemm_4x1 + b .L_dsolve_4x1 +.L_dsolve_4x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 +.L_dsolve_4x1: + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0 + // Store A + A_st_macro 0, 0, 0, 4 + // Strore C + GST xv, , U0, C0, 0x00 +.endm + +.macro dgemm_dsolve_2x1 + bge ZERO, L, .L_dsolve_2x1_load + dgemm_2x1 + b .L_dsolve_2x1 +.L_dsolve_2x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 +.L_dsolve_2x1: + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0 + // Store A + A_st_macro 0, 0, 0, 2 + // Strore C + GST v, , $vr0, C0, 0x00 +.endm + +.macro dgemm_dsolve_1x1 + bge ZERO, L, .L_dsolve_1x1_load + dgemm_1x1 + b .L_dsolve_1x1 +.L_dsolve_1x1_load: + // Load C + fld.d $f0, C0, 0x00 +.L_dsolve_1x1: + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0 + // Store A + A_st_macro 0, 0, 0, 1 + // Strore C + GST f, d, $f0, C0, 0x00 +.endm + + PROLOGUE + push_if_used 26, 32 + PTR_SLLI LDC, LDC, 3 + PTR_SUB KK, ZERO, OFFSET + /* if (!(N >> 2)) goto L_N3 */ + PTR_SRAI J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + beq ZERO, J, .L_N3 +.align 5 +.L_J1: + PTR_ADDI J, J, -1 + move AA, A + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_M15 +.align 4 +.L_I1: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_16x4 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_I1 +.L_M15: + andi I, M, 8 + beqz I, .L_M7 +.L_M8: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_8x4 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_M7: + andi I, M, 4 + beqz I, .L_M3 +.L_M4: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_4x4 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_M3: + andi I, M, 2 + beqz I, .L_M1 +.L_M2: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_2x4 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_M1: + andi I, M, 1 + beqz I, .L_M0 + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_1x4 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_M0: + PTR_SLLI T0, K, 5 + PTR_SLLI T1, LDC, 2 + PTR_ADD B, B, T0 // b += 4 * k + PTR_ADD C, C, T1 // c += 4 * ldc + PTR_ADDI KK, KK, 4 // kk += 4 + bnez J, .L_J1 +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 +.L_N2: + move AA, A + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_N2_M15 +.align 4 +.L_N2_I1: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_16x2 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_N2_I1 +.L_N2_M15: + andi I, M, 8 + beqz I, .L_N2_M7 +.L_N2_M8: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_8x2 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_N2_M7: + andi I, M, 4 + beqz I, .L_N2_M3 +.L_N2_M4: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_4x2 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_N2_M3: + andi I, M, 2 + beqz I, .L_N2_M1 +.L_N2_M2: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_2x2 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_N2_M1: + andi I, M, 1 + beqz I, .L_N2_M0 + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_1x2 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_N2_M0: + PTR_SLLI T0, K, 4 + PTR_SLLI T1, LDC, 1 + PTR_ADD B, B, T0 // b += 2 * k + PTR_ADD C, C, T1 // c += 2 * ldc + PTR_ADDI KK, KK, 2 // kk += 2 +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + move AA, A + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_N1_M15 +.align 4 +.L_N1_I1: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_16x1 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_N1_I1 +.L_N1_M15: + andi I, M, 8 + beqz I, .L_N1_M7 +.L_N1_M8: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_8x1 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_N1_M7: + andi I, M, 4 + beqz I, .L_N1_M3 +.L_N1_M4: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_4x1 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_N1_M3: + andi I, M, 2 + beqz I, .L_N1_M1 +.L_N1_M2: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_2x1 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_N1_M1: + andi I, M, 1 + beqz I, .L_N1_M0 + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_1x1 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_N1_M0: +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S new file mode 100644 index 000000000..5f86d75b5 --- /dev/null +++ b/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S @@ -0,0 +1,953 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/09/26 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + * FLOAT *c, BLASLONG ldc, BLASLONG offset) + */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc +#define OFFSET $r11 // param 9: offset + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define T0 $r25 +#define T1 $r26 +#define T2 $r27 +#define KK $r28 +#define AA $r29 +#define CC $r30 +#define BB $r31 +#undef ZERO +#define ZERO $r0 + +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 + +/* Prefetch interval */ +#define A_PRE 0x400 +#define B_PRE 0x100 + +#include "dtrsm_kernel_macro.S" + +.macro ldrepl_macro start, end, stride +// Load Ux (x = 0...15) +.if \start <= \end + GLDREPL xv, d, $xr\start, B0, \stride * 8 + ldrepl_macro %start + 1, \end, %stride + 1 +.endif +.endm + +.macro nmsub_macro start0, end0, start1, reg +// Ux -= reg * Dx +.if \start0 <= \end0 + xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 + nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg +.endif +.endm + +.macro A_st_macro start, end, stride, N +// Store Ux(x = 0...15) +.if \start <= \end +.if \N == 4 + xvst $xr\start, A0, \stride * 0x20 +.elseif \N == 2 + vst $vr\start, A0, \stride * 0x10 +.elseif \N == 1 + fst.d $f\start, A0, \stride * 0x08 +.endif + A_st_macro %start + 1, \end, %stride + 1, \N +.endif +.endm + +.macro dsolve_16x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//2 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 16, 0 + ldrepl_macro 17, 18, 2 + GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 + nmsub_macro 0, 3, 4, D1 + GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 +// Store A + A_st_macro 0, 7, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ + U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 +.endm + +.macro dsolve_8x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//2 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 16, 0 + ldrepl_macro 17, 18, 2 + GMUL xvf, d, U2, D2, U2, U3, D2, U3 + nmsub_macro 0, 1, 2, D1 + GMUL xvf, d, U0, D0, U0, U1, D0, U1 +// Store A + A_st_macro 0, 3, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ + U2, C1, 0x00, U3, C1, 0x20 +.endm + +.macro dsolve_4x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//2 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 16, 0 + ldrepl_macro 17, 18, 2 + GMUL xvf, d, U1, D2, U1 + nmsub_macro 0, 0, 1, D1 + GMUL xvf, d, U0, D0, U0 +// Store A + A_st_macro 0, 1, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C1, 0x00 +.endm + +.macro dsolve_2x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//2 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 16, 0 + ldrepl_macro 17, 18, 2 + GMUL xvf, d, U1, D2, U1 + nmsub_macro 0, 0, 1, D1 + GMUL xvf, d, U0, D0, U0 +// Store A + A_st_macro 0, 1, 0, 2 +// Store C + GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00 +.endm + +.macro dsolve_1x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//2 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 16, 0 + ldrepl_macro 17, 18, 2 + GMUL xvf, d, U1, D2, U1 + nmsub_macro 0, 0, 1, D1 + GMUL xvf, d, U0, D0, U0 +// Store A + A_st_macro 0, 1, 0, 1 +// Store C + GST f, d, $f0, C0, 0x00, $f1, C1, 0x00 +.endm + +.macro dsolve_16x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//4 5 +//8 9 10 +//12 13 14 15 +// Sequentially extract data from B in row order + ldrepl_macro 22, 25, 12 + GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15 + ldrepl_macro 19, 21, 8 + nmsub_macro 8, 11, 12, D8 + ldrepl_macro 17, 18, 4 + GMUL xvf, d, U8, D5, U8, U9, D5, U9, U10, D5, U10, U11, D5, U11 + ldrepl_macro 16, 16, 0 + nmsub_macro 4, 7, 12, D7 + nmsub_macro 4, 7, 8, D4 + GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 + nmsub_macro 0, 3, 12, D6 + nmsub_macro 0, 3, 8, D3 + nmsub_macro 0, 3, 4, D1 + GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 +// Store A + A_st_macro 0, 15, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ + U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \ + U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60, \ + U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 +.endm + +.macro dsolve_8x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//4 5 +//8 9 10 +//12 13 14 15 +// Sequentially extract data from B in row order + ldrepl_macro 22, 25, 12 + GMUL xvf, d, U6, D9, U6, U7, D9, U7 + ldrepl_macro 19, 21, 8 + nmsub_macro 4, 5, 6, D8 + ldrepl_macro 17, 18, 4 + GMUL xvf, d, U4, D5, U4, U5, D5, U5 + ldrepl_macro 16, 16, 0 + nmsub_macro 2, 3, 6, D7 + nmsub_macro 2, 3, 4, D4 + GMUL xvf, d, U2, D2, U2, U3, D2, U3 + nmsub_macro 0, 1, 6, D6 + nmsub_macro 0, 1, 4, D3 + nmsub_macro 0, 1, 2, D1 + GMUL xvf, d, U0, D0, U0, U1, D0, U1 +// Store A + A_st_macro 0, 7, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ + U2, C1, 0x00, U3, C1, 0x20, \ + U4, C2, 0x00, U5, C2, 0x20, \ + U6, C3, 0x00, U7, C3, 0x20 +.endm + +.macro dsolve_4x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//4 5 +//8 9 10 +//12 13 14 15 +// Sequentially extract data from B in row order + ldrepl_macro 22, 25, 12 + GMUL xvf, d, U3, D9, U3 + ldrepl_macro 19, 21, 8 + nmsub_macro 2, 2, 3, D8 + ldrepl_macro 17, 18, 4 + GMUL xvf, d, U2, D5, U2 + ldrepl_macro 16, 16, 0 + nmsub_macro 1, 1, 3, D7 + nmsub_macro 1, 1, 2, D4 + GMUL xvf, d, U1, D2, U1 + nmsub_macro 0, 0, 3, D6 + nmsub_macro 0, 0, 2, D3 + nmsub_macro 0, 0, 1, D1 + GMUL xvf, d, U0, D0, U0 +// Store A + A_st_macro 0, 3, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00 +.endm + +.macro dsolve_2x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//4 5 +//8 9 10 +//12 13 14 15 +// Sequentially extract data from B in row order + ldrepl_macro 22, 25, 12 + GMUL xvf, d, U3, D9, U3 + ldrepl_macro 19, 21, 8 + nmsub_macro 2, 2, 3, D8 + ldrepl_macro 17, 18, 4 + GMUL xvf, d, U2, D5, U2 + ldrepl_macro 16, 16, 0 + nmsub_macro 1, 1, 3, D7 + nmsub_macro 1, 1, 2, D4 + GMUL xvf, d, U1, D2, U1 + nmsub_macro 0, 0, 3, D6 + nmsub_macro 0, 0, 2, D3 + nmsub_macro 0, 0, 1, D1 + GMUL xvf, d, U0, D0, U0 +// Store A + A_st_macro 0, 3, 0, 2 +// Store C + GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00 +.endm + +.macro dsolve_1x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//4 5 +//8 9 10 +//12 13 14 15 +// Sequentially extract data from B in row order + ldrepl_macro 22, 25, 12 + GMUL xvf, d, U3, D9, U3 + ldrepl_macro 19, 21, 8 + nmsub_macro 2, 2, 3, D8 + ldrepl_macro 17, 18, 4 + GMUL xvf, d, U2, D5, U2 + ldrepl_macro 16, 16, 0 + nmsub_macro 1, 1, 3, D7 + nmsub_macro 1, 1, 2, D4 + GMUL xvf, d, U1, D2, U1 + nmsub_macro 0, 0, 3, D6 + nmsub_macro 0, 0, 2, D3 + nmsub_macro 0, 0, 1, D1 + GMUL xvf, d, U0, D0, U0 +// Store A + A_st_macro 0, 3, 0, 1 +// Store C + GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00, +.endm + +.macro dgemm_dsolve_16x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_16x1_load + dgemm_16x1 + b .L_dsolve_16x1 +.L_dsolve_16x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 +.L_dsolve_16x1: + PTR_ADDI A0, T1, -16 * 8 + PTR_ADDI B0, T2, -1 * 8 + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 + // Store A + A_st_macro 0, 3, 0, 4 + // Strore C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 +.endm + +.macro dgemm_dsolve_8x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_8x1_load + dgemm_8x1 + b .L_dsolve_8x1 +.L_dsolve_8x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 +.L_dsolve_8x1: + PTR_ADDI A0, T1, -8 * 8 + PTR_ADDI B0, T2, -1 * 8 + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1 + // Store A + A_st_macro 0, 1, 0, 4 + // Strore C + GST xv, , U0, C0, 0x00, U1, C0, 0x20 +.endm + +.macro dgemm_dsolve_4x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_4x1_load + dgemm_4x1 + b .L_dsolve_4x1 +.L_dsolve_4x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 +.L_dsolve_4x1: + PTR_ADDI A0, T1, -4 * 8 + PTR_ADDI B0, T2, -1 * 8 + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0 + // Store A + A_st_macro 0, 0, 0, 4 + // Strore C + GST xv, , U0, C0, 0x00 +.endm + +.macro dgemm_dsolve_2x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_2x1_load + dgemm_2x1 + b .L_dsolve_2x1 +.L_dsolve_2x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 +.L_dsolve_2x1: + PTR_ADDI A0, T1, -2 * 8 + PTR_ADDI B0, T2, -1 * 8 + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0 + // Store A + A_st_macro 0, 0, 0, 2 + // Strore C + GST v, , $vr0, C0, 0x00 +.endm + +.macro dgemm_dsolve_1x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_1x1_load + dgemm_1x1 + b .L_dsolve_1x1 +.L_dsolve_1x1_load: + // Load C + fld.d $f0, C0, 0x00 +.L_dsolve_1x1: + PTR_ADDI A0, T1, -1 * 8 + PTR_ADDI B0, T2, -1 * 8 + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0 + // Store A + A_st_macro 0, 0, 0, 1 + // Strore C + GST f, d, $f0, C0, 0x00 +.endm + +.macro dgemm_dsolve_16x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_16x2_load + dgemm_16x2 + b .L_dsolve_16x2 +.L_dsolve_16x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + /* Load C1 */ + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 +.L_dsolve_16x2: + PTR_ADDI A0, T1, -(16 * 2) * 8 + PTR_ADDI B0, T2, -(2 * 2) * 8 + dsolve_16x2 +.endm + +.macro dgemm_dsolve_8x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_8x2_load + dgemm_8x2 + b .L_dsolve_8x2 +.L_dsolve_8x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 +.L_dsolve_8x2: + PTR_ADDI A0, T1, -(8 * 2) * 8 + PTR_ADDI B0, T2, -(2 * 2) * 8 + dsolve_8x2 +.endm + +.macro dgemm_dsolve_4x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_4x2_load + dgemm_4x2 + b .L_dsolve_4x2 +.L_dsolve_4x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_4x2: + PTR_ADDI A0, T1, -(4 * 2) * 8 + PTR_ADDI B0, T2, -(2 * 2) * 8 + dsolve_4x2 +.endm + +.macro dgemm_dsolve_2x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_2x2_load + dgemm_2x2 + b .L_dsolve_2x2 +.L_dsolve_2x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_2x2: + PTR_ADDI A0, T1, -(2 * 2) * 8 + PTR_ADDI B0, T2, -(2 * 2) * 8 + dsolve_2x2 +.endm + +.macro dgemm_dsolve_1x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_1x2_load + dgemm_1x2 + xvpackod.d U1, U0, U0 + b .L_dsolve_1x2 +.L_dsolve_1x2_load: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 +.L_dsolve_1x2: + PTR_ADDI A0, T1, -(1 * 2) * 8 + PTR_ADDI B0, T2, -(2 * 2) * 8 + dsolve_1x2 +.endm + +.macro dgemm_dsolve_16x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_16x4_load + dgemm_16x4 + b .L_dsolve_16x4 +.L_dsolve_16x4_load: + // Load C + GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 + GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 + GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 + GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 +/********************** solver ******************/ +.L_dsolve_16x4: + PTR_ADDI A0, T1, -(16 * 4) * 8 + PTR_ADDI B0, T2, -(4 * 4) * 8 + dsolve_16x4 +.endm + +.macro dgemm_dsolve_8x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_8x4_load + dgemm_8x4 + b .L_dsolve_8x4 +.L_dsolve_8x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + + /* Load C2 */ + xvld U4, C2, 0x00 + xvld U5, C2, 0x20 + + /* Load C3 */ + xvld U6, C3, 0x00 + xvld U7, C3, 0x20 +/********* solver *********/ +.L_dsolve_8x4: + PTR_ADDI A0, T1, -(8 * 4) * 8 + PTR_ADDI B0, T2, -(4 * 4) * 8 + dsolve_8x4 +.endm + +.macro dgemm_dsolve_4x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_4x4_load + dgemm_4x4 + b .L_dsolve_4x4 +.L_dsolve_4x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 +/************** solver *****************/ +.L_dsolve_4x4: + PTR_ADDI A0, T1, -(4 * 4) * 8 + PTR_ADDI B0, T2, -(4 * 4) * 8 + dsolve_4x4 +.endm + +.macro dgemm_dsolve_2x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_2x4_load + dgemm_2x4 + xvpermi.q U2, U0, 0x01 + xvpermi.q U3, U1, 0x01 + b .L_dsolve_2x4 +.L_dsolve_2x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 +/********************** solver ******************/ +.L_dsolve_2x4: + PTR_ADDI A0, T1, -(2 * 4) * 8 + PTR_ADDI B0, T2, -(4 * 4) * 8 + dsolve_2x4 +.endm + +.macro dgemm_dsolve_1x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_1x4_load + dgemm_1x4 + xvpackod.d U1, U0, U0 + xvpermi.q U2, U0, 0x01 + xvpermi.q U3, U1, 0x01 + b .L_dsolve_1x4 +.L_dsolve_1x4_load: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 + fld.d $f2, C2, 0x00 + fld.d $f3, C3, 0x00 +.L_dsolve_1x4: + PTR_ADDI A0, T1, -(1 * 4) * 8 + PTR_ADDI B0, T2, -(4 * 4) * 8 + dsolve_1x4 +.endm + + PROLOGUE + push_if_used 26, 32 + PTR_SLLI LDC, LDC, 3 + PTR_SUB KK, N, OFFSET + PTR_MUL T0, N, LDC + PTR_MUL T1, N, K + PTR_ADD C, C, T0 // c += n * ldc + PTR_SLLI T1, T1, 3 + PTR_ADD B, B, T1 + + andi J, N, 1 + beqz J, .L_N2 +.L_N1: + move AA, A + PTR_SUB C, C, LDC // c -= ldc + PTR_SLLI T0, K, 3 + PTR_SLLI T1, KK, 3 + PTR_SUB B, B, T0 // b -= k + PTR_ADD BB, B, T1 // bb = b + kk + move CC, C + + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_N1_M15 +.align 4 +.L_N1_I1: + PTR_SLLI T1, KK, 7 + GADD , d, C0, CC, ZERO + PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_16x1 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_N1_I1 +.L_N1_M15: + andi I, M, 8 + beqz I, .L_N1_M7 +.L_N1_M8: + PTR_SLLI T1, KK, 6 + GADD , d, C0, CC, ZERO + PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_8x1 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_N1_M7: + andi I, M, 4 + beqz I, .L_N1_M3 +.L_N1_M4: + PTR_SLLI T1, KK, 5 + GADD , d, C0, CC, ZERO + PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_4x1 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_N1_M3: + andi I, M, 2 + beqz I, .L_N1_M1 +.L_N1_M2: + PTR_SLLI T1, KK, 4 + GADD , d, C0, CC, ZERO + PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_2x1 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_N1_M1: + andi I, M, 1 + beqz I, .L_N1_M0 + PTR_SLLI T1, KK, 3 + GADD , d, C0, CC, ZERO + PTR_ADD A0, AA, T1 // a0 = aa + kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_1x1 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_N1_M0: + PTR_ADDI KK, KK, -1 +.L_N2: + andi J, N, 2 + beq ZERO, J, .L_N4 + move AA, A + PTR_SLLI T0, LDC, 1 + PTR_SLLI T1, K, 4 + PTR_SLLI T2, KK, 4 + PTR_SUB B, B, T1 + PTR_SUB C, C, T0 + PTR_ADD BB, B, T2 + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_N2_M15 +.align 4 +.L_N2_I1: + PTR_SLLI T1, KK, 7 + GADD , d, C0, CC, ZERO, C1, C0, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_16x2 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_N2_I1 +.L_N2_M15: + andi I, M, 8 + beqz I, .L_N2_M7 +.L_N2_M8: + PTR_SLLI T1, KK, 6 + GADD , d, C0, CC, ZERO, C1, C0, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_8x2 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_N2_M7: + andi I, M, 4 + beqz I, .L_N2_M3 +.L_N2_M4: + PTR_SLLI T1, KK, 5 + GADD , d, C0, CC, ZERO, C1, C0, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_4x2 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_N2_M3: + andi I, M, 2 + beqz I, .L_N2_M1 +.L_N2_M2: + PTR_SLLI T1, KK, 4 + GADD , d, C0, CC, ZERO, C1, C0, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_2x2 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_N2_M1: + andi I, M, 1 + beqz I, .L_N2_M0 + PTR_SLLI T1, KK, 3 + GADD , d, C0, CC, ZERO, C1, C0, LDC + PTR_ADD A0, AA, T1 // a0 = aa + kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_1x2 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_N2_M0: + PTR_ADDI KK, KK, -2 +.L_N4: + PTR_SRAI J, N, 2 /* J = bn >> 2 */ + beq ZERO, J, .L_N0 +.align 5 +.L_J1: + PTR_ADDI J, J, -1 + move AA, A + PTR_SLLI T0, LDC, 2 + PTR_SLLI T1, K, 5 + PTR_SLLI T2, KK, 5 + PTR_SUB B, B, T1 + PTR_SUB C, C, T0 + PTR_ADD BB, B, T2 + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_M15 +.align 4 +.L_I1: + PTR_SLLI T1, KK, 7 + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_16x4 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_I1 +.L_M15: + andi I, M, 8 + beqz I, .L_M7 +.L_M8: + PTR_SLLI T1, KK, 6 + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_8x4 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_M7: + andi I, M, 4 + beqz I, .L_M3 +.L_M4: + PTR_SLLI T1, KK, 5 + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_4x4 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_M3: + andi I, M, 2 + beqz I, .L_M1 +.L_M2: + PTR_SLLI T1, KK, 4 + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_2x4 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_M1: + andi I, M, 1 + beqz I, .L_M0 + PTR_SLLI T1, KK, 3 + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + PTR_ADD A0, AA, T1 // a0 = aa + kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_1x4 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_M0: + PTR_ADDI KK, KK, -4 + bnez J, .L_J1 +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dtrsm_kernel_macro.S b/kernel/loongarch64/dtrsm_kernel_macro.S new file mode 100644 index 000000000..88b7121d1 --- /dev/null +++ b/kernel/loongarch64/dtrsm_kernel_macro.S @@ -0,0 +1,2147 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +/************** Dgemm Kernel 16x4 ****************/ +.macro KERNEL2x16x4 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + + preld 0, B0, B_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D10, U10, U14, D10 + xvfmadd.d D11, U11, U14, D11 + + preld 0, A0, A_PRE + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 + + preld 0, A0, A_PRE + 0x40 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D14, U10, U15, D14 + xvfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvld U10, A0, 0x40 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvld U11, A0, 0x60 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvldrepl.d U12, B0, 0x00 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + + preld 0, B0, B_PRE + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 + + preld 0, A0, A_PRE + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + + preld 0, A0, A_PRE + 0x40 + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x16x4_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + + preld 0, B0, B_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D10, U10, U14, D10 + xvfmadd.d D11, U11, U14, D11 + + preld 0, A0, A_PRE + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 + + preld 0, A0, A_PRE + 0x40 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D14, U10, U15, D14 + xvfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + + preld 0, B0, B_PRE + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 + + preld 0, A0, A_PRE + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + + preld 0, A0, A_PRE + 0x40 + xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 +.endm + +.macro KERNEL8x16x4 +.rept 4 + KERNEL2x16x4 +.endr +.endm + +.macro KERNEL8x16x4_END +.rept 3 + KERNEL2x16x4 +.endr + KERNEL2x16x4_END +.endm + +.macro KERNEL2x8x4 + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + + xvldrepl.d U12, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x8x4_END + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 +.endm + +.macro KERNEL8x8x4 +.rept 4 + KERNEL2x8x4 +.endr +.endm + +.macro KERNEL8x8x4_END +.rept 3 + KERNEL2x8x4 +.endr + KERNEL2x8x4_END +.endm + +.macro KERNEL2x4x4 + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U8, U12, D0 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + xvld U8, A0, 0x00 + + xvldrepl.d U12, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x4x4_END + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U8, U12, D0 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D12, U0, U7, D12 +.endm + +.macro KERNEL8x4x4 +.rept 4 + KERNEL2x4x4 +.endr +.endm + +.macro KERNEL8x4x4_END +.rept 3 + KERNEL2x4x4 +.endr + KERNEL2x4x4_END +.endm + +.macro KERNEL2x2x4 + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 + + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U4, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + xvldrepl.d U8, A0, 0x00 + xvldrepl.d U9, A0, 0x08 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvld U12, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x2x4_END + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 + + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U4, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 +.endm + +.macro KERNEL8x2x4 +.rept 4 + KERNEL2x2x4 +.endr +.endm + +.macro KERNEL8x2x4_END +.rept 3 + KERNEL2x2x4 +.endr + KERNEL2x2x4_END +.endm + +.macro KERNEL2x1x4 + xvldrepl.d U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvld U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + xvldrepl.d U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvld U12, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x1x4_END + xvldrepl.d U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvld U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x4 +.rept 4 + KERNEL2x1x4 +.endr +.endm + +.macro KERNEL8x1x4_END +.rept 3 + KERNEL2x1x4 +.endr + KERNEL2x1x4_END +.endm + +.macro KERNEL2x16x2 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvld U10, A0, 0x40 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvld U11, A0, 0x60 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x16x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 +.endm + +.macro KERNEL8x16x2 +.rept 4 + KERNEL2x16x2 +.endr +.endm + +.macro KERNEL8x16x2_END +.rept 3 + KERNEL2x16x2 +.endr + KERNEL2x16x2_END +.endm + +.macro KERNEL2x8x2 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvld U9, A0, 0x20 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x8x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 +.endm + +.macro KERNEL8x8x2 +.rept 4 + KERNEL2x8x2 +.endr +.endm + +.macro KERNEL8x8x2_END +.rept 3 + KERNEL2x8x2 + .endr + KERNEL2x8x2_END +.endm + +.macro KERNEL2x4x2 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x4x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm + +.macro KERNEL8x4x2 +.rept 4 + KERNEL2x4x2 +.endr +.endm + +.macro KERNEL8x4x2_END +.rept 3 + KERNEL2x4x2 +.endr + KERNEL2x4x2_END +.endm + +.macro KERNEL2x2x2 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x2x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm + +.macro KERNEL8x2x2 +.rept 4 + KERNEL2x2x2 +.endr +.endm + +.macro KERNEL8x2x2_END +.rept 3 + KERNEL2x2x2 +.endr + KERNEL2x2x2_END +.endm + +.macro KERNEL2x1x2 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x1x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm + +.macro KERNEL8x1x2 +.rept 4 + KERNEL2x1x2 +.endr +.endm + +.macro KERNEL8x1x2_END +.rept 3 + KERNEL2x1x2 +.endr + KERNEL2x1x2_END +.endm + +.macro KERNEL2x16x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 + + xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x16x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 +.endm + +.macro KERNEL8x16x1 +.rept 4 + KERNEL2x16x1 +.endr +.endm + +.macro KERNEL8x16x1_END +.rept 3 + KERNEL2x16x1 +.endr + KERNEL2x16x1_END +.endm + +.macro KERNEL2x8x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvld U9, A0, 0x20 + xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x8x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 +.endm + +.macro KERNEL8x8x1 +.rept 4 + KERNEL2x8x1 +.endr +.endm + +.macro KERNEL8x8x1_END +.rept 3 + KERNEL2x8x1 +.endr + KERNEL2x8x1_END +.endm + +.macro KERNEL2x4x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x4x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + xvfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x4x1 +.rept 4 + KERNEL2x4x1 +.endr +.endm + +.macro KERNEL8x4x1_END +.rept 3 + KERNEL2x4x1 +.endr + KERNEL2x4x1_END +.endm + +.macro KERNEL2x2x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x2x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + xvfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x2x1 +.rept 4 + KERNEL2x2x1 +.endr +.endm + +.macro KERNEL8x2x1_END +.rept 3 + KERNEL2x2x1 +.endr + KERNEL2x2x1_END +.endm + +.macro KERNEL2x1x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x1x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + xvfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x1 +.rept 4 + KERNEL2x1x1 +.endr +.endm + +.macro KERNEL8x1x1_END +.rept 3 + KERNEL2x1x1 +.endr + KERNEL2x1x1_END +.endm + +.macro dgemm_16x4 +.L_dgemm_16x4: // See dgemm_kernel_16x4.S + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + xvfmul.d D6, U2, U5 + xvfmul.d D7, U3, U5 + + xvldrepl.d U6, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U6 + xvfmul.d D9, U1, U6 + xvfmul.d D10, U2, U6 + xvfmul.d D11, U3, U6 + + xvldrepl.d U7, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U7 + xvfmul.d D13, U1, U7 + xvfmul.d D14, U2, U7 + xvfmul.d D15, U3, U7 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_dgemm_16x4_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x20 + + beq ZERO, TL, .L_dgemm_16x4_TL1_END +.align 5 +.L_dgemm_16x4_TL1: + KERNEL8x16x4 + PTR_ADDI TL, TL, -1 + blt ZERO, TL, .L_dgemm_16x4_TL1 +.L_dgemm_16x4_TL1_END: + KERNEL8x16x4_END +.L_dgemm_16x4_L7: + andi TL, L, 7 + beq TL, ZERO, .L_dgemm_16x4_L0 +.align 5 +.L_dgemm_16x4_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 + + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x20 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_16x4_L71 +.L_dgemm_16x4_L0: + // Load C + GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 + GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 + GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 + GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 + GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3, \ + U4, U4, D4, U5, U5, D5, U6, U6, D6, U7, U7, D7, \ + U8, U8, D8, U9, U9, D9, U10, U10, D10, U11, U11, D11, \ + U12, U12, D12, U13, U13, D13, U14, U14, D14, U15, U15, D15 +.endm + +.macro dgemm_1x4 +.L_dgemm_1x4: // See dgemm_kernel_16x4.S + xvldrepl.d U0, A0, 0x00 + xvld U4, B0, 0x00 + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x08 + PTR_ADDI B0, B0, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_dgemm_1x4_M1_L7 + xvldrepl.d U8, A0, 0x00 + + PTR_ADDI TL, TL, -1 + xvld U12, B0, 0x00 + PTR_ADDI A0, A0, 0x08 + PTR_ADDI B0, B0, 0x20 + + beq ZERO, TL, .L_dgemm_1x4_M1_TL1_END +.align 5 +.L_dgemm_1x4_M1_TL1: + KERNEL8x1x4 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_1x4_M1_TL1 +.L_dgemm_1x4_M1_TL1_END: + KERNEL8x1x4_END +.L_dgemm_1x4_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_1x4_M1_L0 +.align 5 +.L_dgemm_1x4_M1_L71: + xvldrepl.d U0, A0, 0x00 + xvld U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x08 + PTR_ADDI B0, B0, 0x20 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_1x4_M1_L71 +.L_dgemm_1x4_M1_L0: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 + fld.d $f2, C2, 0x00 + fld.d $f3, C3, 0x00 + xvinsve0.d U0, U1, 0x01 + xvinsve0.d U0, U2, 0x02 + xvinsve0.d U0, U3, 0x03 + GSUB xvf, d, U0, U0, D0 +.endm + +.macro dgemm_2x4 +.L_dgemm_2x4: + /* Load 2 * 64 from A0 */ + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 + xvld U4, B0, 0x00 + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_dgemm_2x4_M2_L7 + + xvldrepl.d U8, A0, 0x00 + xvldrepl.d U9, A0, 0x08 + + PTR_ADDI TL, TL, -1 + + xvld U12, B0, 0x00 + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x20 + + beq ZERO, TL, .L_dgemm_2x4_M2_TL1_END +.align 5 +.L_dgemm_2x4_M2_TL1: + KERNEL8x2x4 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_2x4_M2_TL1 +.L_dgemm_2x4_M2_TL1_END: + KERNEL8x2x4_END + +.L_dgemm_2x4_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_2x4_M2_L0 +.align 5 +.L_dgemm_2x4_M2_L71: + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 + + xvld U4, B0, 0x00 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x20 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_2x4_M2_L71 +.L_dgemm_2x4_M2_L0: + xvpackev.d D4, D1, D0 + xvpackod.d D5, D1, D0 + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U1, U3, 0x02 + + GSUB xvf, d, U0, U0, D4, U1, U1, D5 +.endm + +.macro dgemm_4x4 +.L_dgemm_4x4: + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + + xvldrepl.d U6, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U6 + + xvldrepl.d U7, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U7 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_dgemm_4x4_M4_L7 + + xvld U8, A0, 0x00 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x20 + + beq ZERO, TL, .L_dgemm_4x4_M4_TL1_END +.align 5 +.L_dgemm_4x4_M4_TL1: /* TL-- */ + KERNEL8x4x4 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_4x4_M4_TL1 +.L_dgemm_4x4_M4_TL1_END: + KERNEL8x4x4_END +.L_dgemm_4x4_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_4x4_M4_L0 +.align 5 +.L_dgemm_4x4_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x20 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_4x4_M4_L71 + .L_dgemm_4x4_M4_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 + + GSUB xvf, d, U0, U0, D0, U1, U1, D4, U2, U2, D8, U3, U3, D12 +.endm + +.macro dgemm_8x4 +.L_dgemm_8x4: + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + + xvldrepl.d U6, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U6 + xvfmul.d D9, U1, U6 + + xvldrepl.d U7, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U7 + xvfmul.d D13, U1, U7 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M8_L7 */ + beq ZERO,TL, .L_dgemm_8x4_M8_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x20 + + beq ZERO, TL, .L_dgemm_8x4_M8_TL1_END +.align 5 +.L_dgemm_8x4_M8_TL1: /* TL-- */ + KERNEL8x8x4 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_8x4_M8_TL1 + +.L_dgemm_8x4_M8_TL1_END: + KERNEL8x8x4_END + +.L_dgemm_8x4_M8_L7: + /* if (!(L & 7)) goto L_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_8x4_M8_L0 +.align 5 +.L_dgemm_8x4_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x20 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_8x4_M8_L71 +.L_dgemm_8x4_M8_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + + /* Load C2 */ + xvld U4, C2, 0x00 + xvld U5, C2, 0x20 + + /* Load C3 */ + xvld U6, C3, 0x00 + xvld U7, C3, 0x20 + + GSUB xvf, d, U0, U0, D0, U1, U1, D1, \ + U2, U2, D4, U3, U3, D5, \ + U4, U4, D8, U5, U5, D9, \ + U6, U6, D12, U7, U7, D13 +.endm + +.macro dgemm_4x2 +.L_dgemm_4x2: + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_dgemm_4x2_N3_M4_L7 */ + beq ZERO,TL, .L_dgemm_4x2_N3_M4_L7 + + xvld U8, A0, 0x00 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x10 + + beq ZERO, TL, .L_dgemm_4x2_N3_M4_TL1_END +.align 5 +.L_dgemm_4x2_N3_M4_TL1: /* TL-- */ + KERNEL8x4x2 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_4x2_N3_M4_TL1 +.L_dgemm_4x2_N3_M4_TL1_END: + KERNEL8x4x2_END + +.L_dgemm_4x2_N3_M4_L7: + /* if (!(L & 7)) goto L_dgemm_4x2_N3_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_4x2_N3_M4_L0 +.align 5 +.L_dgemm_4x2_N3_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x10 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_4x2_N3_M4_L71 + +.L_dgemm_4x2_N3_M4_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + GSUB xvf, d, U0, U0, D0, U1, U1, D4 +.endm + +.macro dgemm_2x2 +.L_dgemm_2x2: + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_dgemm_2x2_N3_M2_L7 */ + beq ZERO,TL, .L_dgemm_2x2_N3_M2_L7 + + xvld U8, A0, 0x00 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x10 + + beq ZERO, TL, .L_dgemm_2x2_N3_M2_TL1_END +.align 5 +.L_dgemm_2x2_N3_M2_TL1: /* TL-- */ + KERNEL8x2x2 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_2x2_N3_M2_TL1 +.L_dgemm_2x2_N3_M2_TL1_END: + KERNEL8x2x2_END + +.L_dgemm_2x2_N3_M2_L7: + /* if (!(L & 7)) goto L_dgemm_2x2_N3_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_2x2_N3_M2_L0 +.align 5 +.L_dgemm_2x2_N3_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x10 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_2x2_N3_M2_L71 +.L_dgemm_2x2_N3_M2_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + GSUB xvf, d, U0, U0, D0, U1, U1, D4 +.endm + +.macro dgemm_8x2 +.L_dgemm_8x2: + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_dgemm_8x2_N3_M8_L7 */ + beq ZERO,TL, .L_dgemm_8x2_N3_M8_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x10 + + beq ZERO, TL, .L_dgemm_8x2_N3_M8_TL1_END +.align 5 +.L_dgemm_8x2_N3_M8_TL1: /* TL-- */ + KERNEL8x8x2 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_8x2_N3_M8_TL1 +.L_dgemm_8x2_N3_M8_TL1_END: + KERNEL8x8x2_END + +.L_dgemm_8x2_N3_M8_L7: + /* if (!(L & 7)) goto L_dgemm_8x2_N3_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_8x2_N3_M8_L0 +.align 5 +.L_dgemm_8x2_N3_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x10 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_8x2_N3_M8_L71 + +.L_dgemm_8x2_N3_M8_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D4, U3, U3, D5 +.endm + +.macro dgemm_16x2 +.L_dgemm_16x2: + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + xvfmul.d D6, U2, U5 + xvfmul.d D7, U3, U5 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_dgemm_16x2_N3_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x10 + + beq ZERO, TL, .L_dgemm_16x2_N3_TL1_END +.align 5 +.L_dgemm_16x2_N3_TL1: /* TL-- */ + KERNEL8x16x2 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_16x2_N3_TL1 +.L_dgemm_16x2_N3_TL1_END: + KERNEL8x16x2_END + +.L_dgemm_16x2_N3_L7: + /* if (!(L & 7)) goto L_dgemm_16x2_N3_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_16x2_N3_L0 +.align 5 +.L_dgemm_16x2_N3_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x10 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_16x2_N3_L71 + +.L_dgemm_16x2_N3_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + /* Load C1 */ + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 + GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3, \ + U4, U4, D4, U5, U5, D5, U6, U6, D6, U7, U7, D7 +.endm + +.macro dgemm_2x1 +.L_dgemm_2x1: + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_dgemm_2x1_N1_M2_L7 */ + beq ZERO,TL, .L_dgemm_2x1_N1_M2_L7 + + xvld U8, A0, 0x00 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x08 + + beq ZERO, TL, .L_dgemm_2x1_N1_M2_TL1_END +.align 5 +.L_dgemm_2x1_N1_M2_TL1: /* TL-- */ + KERNEL8x2x1 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_2x1_N1_M2_TL1 +.L_dgemm_2x1_N1_M2_TL1_END: + KERNEL8x2x1_END + +.L_dgemm_2x1_N1_M2_L7: + /* if (!(L & 7)) goto L_dgemm_2x1_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_2x1_N1_M2_L0 +.align 5 +.L_dgemm_2x1_N1_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x08 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_2x1_N1_M2_L71 +.L_dgemm_2x1_N1_M2_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + GSUB xvf, d, U0, U0, D0 +.endm + +.macro dgemm_4x1 +.L_dgemm_4x1: + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_dgemm_4x1_N1_M4_L7 */ + beq ZERO,TL, .L_dgemm_4x1_N1_M4_L7 + + xvld U8, A0, 0x00 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x08 + + beq ZERO, TL, .L_dgemm_4x1_N1_M4_TL1_END +.align 5 +.L_dgemm_4x1_N1_M4_TL1: /* TL-- */ + KERNEL8x4x1 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_4x1_N1_M4_TL1 +.L_dgemm_4x1_N1_M4_TL1_END: + KERNEL8x4x1_END + +.L_dgemm_4x1_N1_M4_L7: + /* if (!(L & 7)) goto L_dgemm_4x1_N1_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_4x1_N1_M4_L0 +.align 5 +.L_dgemm_4x1_N1_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x08 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_4x1_N1_M4_L71 +.L_dgemm_4x1_N1_M4_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + GSUB xvf, d, U0, U0, D0 +.endm + +.macro dgemm_8x1 +.L_dgemm_8x1: + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_dgemm_8x1_N1_M8_L7 */ + beq ZERO,TL, .L_dgemm_8x1_N1_M8_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x08 + + beq ZERO, TL, .L_dgemm_8x1_N1_M8_TL1_END +.align 5 +.L_dgemm_8x1_N1_M8_TL1: /* TL-- */ + KERNEL8x8x1 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_8x1_N1_M8_TL1 + +.L_dgemm_8x1_N1_M8_TL1_END: + KERNEL8x8x1_END + +.L_dgemm_8x1_N1_M8_L7: + /* if (!(L & 7)) goto L_dgemm_8x1_N1_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_8x1_N1_M8_L0 +.align 5 +.L_dgemm_8x1_N1_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x08 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_8x1_N1_M8_L71 +.L_dgemm_8x1_N1_M8_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + GSUB xvf, d, U0, U0, D0, U1, U1, D1 +.endm + +.macro dgemm_16x1 +.L_dgemm_16x1: + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_dgemm_16x1_N1_L7 */ + beq ZERO,TL, .L_dgemm_16x1_N1_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x08 + + beq ZERO, TL, .L_dgemm_16x1_N1_TL1_END +.align 5 +.L_dgemm_16x1_N1_TL1: /* TL-- */ + KERNEL8x16x1 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_16x1_N1_TL1 +.L_dgemm_16x1_N1_TL1_END: + KERNEL8x16x1_END + +.L_dgemm_16x1_N1_L7: + /* if (!(L & 7)) goto L_dgemm_16x1_N1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_16x1_N1_L0 +.align 5 +.L_dgemm_16x1_N1_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x08 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_16x1_N1_L71 +.L_dgemm_16x1_N1_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3 +.endm + +.macro dgemm_1x2 +.L_dgemm_1x2: // See dgemm_kernel_16x4.S + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_dgemm_1x2_N3_M1_L7 + + xvld U8, A0, 0x00 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + beq ZERO, TL, .L_dgemm_1x2_N3_M1_TL1_END +.L_dgemm_1x2_N3_M1_TL1: /* TL-- */ + KERNEL8x1x2 + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_1x2_N3_M1_TL1 +.L_dgemm_1x2_N3_M1_TL1_END: + KERNEL8x1x2_END +.L_dgemm_1x2_N3_M1_L7: + /* if (!(L & 7)) goto L_dgemm_1x2_N3_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_1x2_N3_M1_L0 +.L_dgemm_1x2_N3_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_dgemm_1x2_N3_M1_L71 +.L_dgemm_1x2_N3_M1_L0: + xvld U0, C0, 0x00 + xvld U1, C1, 0x00 + xvinsve0.d U0, U1, 0x01 + xvinsve0.d D0, D4, 0x01 + GSUB xvf, d, U0, U0, D0 +.endm + +.macro dgemm_1x1 +.L_dgemm_1x1: + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 + + xvld U8, A0, 0x00 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M1_TL1_END +.L_N1_M1_TL1: /* TL-- */ + KERNEL8x1x1 + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 +.L_N1_M1_TL1_END: + KERNEL8x1x1_END +.L_N1_M1_L7: + /* if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 + +.L_N1_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 +.L_N1_M1_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + GSUB xvf, d, U0, U0, D0 +.endm diff --git a/kernel/loongarch64/iamax_lasx.S b/kernel/loongarch64/iamax_lasx.S new file mode 100644 index 000000000..090da3004 --- /dev/null +++ b/kernel/loongarch64/iamax_lasx.S @@ -0,0 +1,542 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfmaxa.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmaxa.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 +#else + xvld VX0, X, 0 * SIZE + addi.d I, I, -1 + xvadd.w VI1, VI1, VINC8 + xvfmaxa.s VM1, VX0, VM0 + xvfcmp.ceq.s VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 +#endif + XVFMAXA VM1, x1, x2 + XVCMPEQ VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + XVFMAXA VM0, x3, x4 + XVCMPEQ VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + XVFMAXA VM0, VM0, VM1 + XVCMPEQ VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + CMPEQ $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t2, 1 + xvinsgr2vr.w VM0, t3, 2 + xvinsgr2vr.w VM0, t4, 3 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 4 + xvinsgr2vr.w VM0, t2, 5 + xvinsgr2vr.w VM0, t3, 6 + xvinsgr2vr.w VM0, t4, 7 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfmaxa.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmaxa.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvadd.w VI1, VI1, VINC8 + xvfmaxa.s VM1, VX0, VM0 + xvfcmp.ceq.s VT0, VM1, VM0 + addi.d I, I, -1 + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmaxa.d VM1, x1, x2 + xvfcmp.ceq.d VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.d VM0, x4, x3 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmaxa.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfmaxa.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + CMPEQ $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 +#else + fmov.s $f16, $f20 +#endif + .align 3 + +#ifdef DOUBLE + +#else +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfmaxa.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 +.L272: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + xvfmaxa.s VM0, VX0, VM0 + xvfcmp.ceq.s VT0, VM0, VX0 + xvbitsel.v VI0, VI0, VI1, VT0 + movfr2gr.s i0, $f20 +#endif + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD $f9, X, 0 + addi.d I, I, -1 + XVFMAXA VM1, x1, VM0 + XVCMPEQ VT0, VM0, VM1 + add.d X, X, INCX + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/iamax_lsx.S b/kernel/loongarch64/iamax_lsx.S new file mode 100644 index 000000000..ce5b3c724 --- /dev/null +++ b/kernel/loongarch64/iamax_lsx.S @@ -0,0 +1,482 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC2 $vr17 +#define VINC4 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L11 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC2, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfmaxa.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfmaxa.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmaxa.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + vbitsel.v x2, x4, x2, VT0 + vfmaxa.d VM1, VM0, x3 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, x2, VI0, VT0 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE +#else + vld VX0, X, 0 * SIZE + vadd.w VI1, VI1, VINC4 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC2 + vfmaxa.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + addi.d I, I, -1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmaxa.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L16 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L17 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC2, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC4, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC4, VINC2, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +#ifdef DOUBLE +.L16: + vfmaxa.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L17: + movfr2gr.d i0, $f20 + .align 3 + +.L11: //INCX==1 and N<8 + andi I, N, 7 + bge $r0, I, .L14 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L13: + fld.d $f9, X, 0 + vfmaxa.d VM1, x1, VM0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d I, I, -1 + addi.d i1, i1, 1 + addi.d X, X, SIZE + movgr2fr.d $f21, i1 + blt $r0, I, .L13 + movfr2gr.d i0, $f20 + .align 3 + +.L14: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmaxa.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, VINC2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmaxa.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmaxa.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + vbitsel.v x2, x4, x2, VT0 + vfmaxa.d VM1, VM0, x3 + vbitsel.v VM0, VM1, VM0, VT0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VI0, x2, VI0, VT0 + addi.d I, I, -1 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 + .align 3 + +.L26: + vfmaxa.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 + +#else +.L20: // INCX!=1 + move TEMP, X + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC2, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 + .align 3 + +.L24: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, VINC4 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, VINC2 + vfmaxa.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmaxa.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d I, I, -1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC2, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC4, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC4, VINC2, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +#endif +.L21: // N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD $f9, X, 0 + VFMAXA VM1, x1, VM0 + VCMPEQ VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d I, I, -1 + addi.d i1, i1, 1 + add.d X, X, INCX + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/iamin_lasx.S b/kernel/loongarch64/iamin_lasx.S new file mode 100644 index 000000000..6ea117907 --- /dev/null +++ b/kernel/loongarch64/iamin_lasx.S @@ -0,0 +1,486 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfmina.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmina.d VM1, VM0, VM1 +#else + addi.d I, I, -1 + xvadd.w VI2, VI1, VINC8 + xvfmina.s VM1, VX0, VM0 +#endif + XVCMPEQ VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmina.d VM1, x1, x2 + xvfcmp.ceq.d VT0, x1, VM1 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmina.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 +#endif + xvbitsel.v VINC4, VI2, VI1, VT0 + XVFMINA VM0, x4, x3 + XVCMPEQ VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + XVFMINA VM0, VM0, VM1 + XVCMPEQ VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t2, 1 + xvinsgr2vr.w VM0, t3, 2 + xvinsgr2vr.w VM0, t4, 3 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 4 + xvinsgr2vr.w VM0, t2, 5 + xvinsgr2vr.w VM0, t3, 6 + xvinsgr2vr.w VM0, t4, 7 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfmina.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmina.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvadd.w VI2, VI1, VINC8 + xvfmina.s VM1, VX0, VM0 + xvfcmp.ceq.s VT0, VM1, VM0 +#endif + addi.d I, I, -1 + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 + .align 3 +#else + fmov.s $f16, $f20 + .align 3 + +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfmina.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmina.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmina.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + xvfmina.s VM0, VX0, VM0 + xvfcmp.ceq.s VT0, VM0, VX0 + xvbitsel.v VI0, VI0, VI1, VT0 + movfr2gr.s i0, $f20 + +#endif + +.L21: // N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD $f9, X, 0 + addi.d I, I, -1 + XVFMINA VM1, x1, VM0 + XVCMPEQ VT0, VM0, VM1 + add.d X, X, INCX + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/iamin_lsx.S b/kernel/loongarch64/iamin_lsx.S new file mode 100644 index 000000000..ce885fd88 --- /dev/null +++ b/kernel/loongarch64/iamin_lsx.S @@ -0,0 +1,446 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#ifdef DOUBLE +#define VINC2 $vr17 +#define VINC4 $vr18 +#else +#define VINC4 $vr17 +#define VINC8 $vr18 +#endif +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfmina.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfmina.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmina.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + addi.d I, I, -1 + vbitsel.v x2, x4, x2, VT0 + vfmina.d VM1, VM0, x3 +#else + vadd.w VI1, VI1, VINC8 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC4 + vfmina.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + addi.d I, I, -1 + vbitsel.v x2, VI2, VI1, VT0 + vfmina.s VM1, VM0, VM1 +#endif + VCMPEQ VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, x2, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + + vreplvei.d VI1, VI0, 0 + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmina.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, VINC2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmina.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmina.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + addi.d I, I, -1 + vbitsel.v x2, x4, x2, VT0 + vfmina.d VM1, VM0, x3 + vbitsel.v VM0, VM1, VM0, VT0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VI0, x2, VI0, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, VINC4 + vfmina.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmina.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d I, I, -1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + vfmina.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 +#endif + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD $f9, X, 0 + addi.d I, I, -1 + VFMINA VM1, x1, VM0 + VCMPEQ VT0, VM0, VM1 + add.d X, X, INCX + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + MTC $f21, i1 + blt $r0, I, .L22 + movfr2gr.s i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/icamax_lasx.S b/kernel/loongarch64/icamax_lasx.S new file mode 100644 index 000000000..7800cb917 --- /dev/null +++ b/kernel/loongarch64/icamax_lasx.S @@ -0,0 +1,562 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + xvxor.v VM0, VM0, VM0 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + xvxor.v VI3, VI3, VI3 // 0 +#ifdef DOUBLE + li.d I, -1 + xvreplgr2vr.d VI4, I + xvffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + li.w I, -1 + xvreplgr2vr.w VI4, I + xvffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC4 + xvld VX1, X, 4 * SIZE + addi.d I, I, -1 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 +#else + xvadd.w VI1, VI1, VINC8 + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VI4, x1 + xvfmul.s x4, VI4, x2 + xvfcmp.clt.s VT0, x1, VI3 + xvfcmp.clt.s VINC4, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC4 +#endif + XVFADD x1, x1, x2 + XVFMAX x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmax.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmax.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmax.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + xvadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + addi.d I, I, -1 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + xvfadd.d x1, x1, x2 + xvfmax.d x3, VM0, x1 + xvfcmp.ceq.d VT0, x3, VM0 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VI4, x1 + xvfmul.s x4, VI4, x2 + xvfcmp.clt.s VT0, x1, VI3 + xvfcmp.clt.s VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + xvfadd.s x1, x1, x2 + xvfmax.s x3, VM0, x1 + xvfcmp.ceq.s VT0, x3, VM0 +#endif + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmaxa.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 +#else + fmov.s $f16, $f20 +#endif + .align 3 + +#ifdef DOUBLE +#else +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v x1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + fcmp.clt.s $fcc0, $f15, $f13 + fsel $f15, $f15, $f13, $fcc0 + fsel $f20, $f20, $f16, $fcc0 + movfr2gr.s i0, $f20 + +#endif +.L21: //N<8 +#ifdef DOUBLE + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 +#else + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 +#endif + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMAX a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/icamax_lsx.S b/kernel/loongarch64/icamax_lsx.S new file mode 100644 index 000000000..a2fc9dbbd --- /dev/null +++ b/kernel/loongarch64/icamax_lsx.S @@ -0,0 +1,434 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC4 $vr17 +#define VINC8 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + vxor.v VM0, VM0, VM0 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + vxor.v VI3, VI3, VI3 // 0 +#ifdef DOUBLE + li.d I, -1 + vreplgr2vr.d VI4, I + vffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + li.w I, -1 + vreplgr2vr.w VI4, I + vffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + addi.d I, I, -1 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmax.d x3, VM0, x1 + vfcmp.ceq.d VT0, x3, VM0 + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 +#else + vadd.w VI1, VI1, VINC4 + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VI4, x1 + vfmul.s x4, VI4, x2 +#endif + VCMPLT VT0, x1, VI3 + VCMPLT VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + VFADD x1, x1, x2 + VFMAX x3, VM0, x1 + VCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmax.d x3, VM0, x1 + ld.d t1, X, 0 * SIZE + vfcmp.ceq.d VT0, x3, VM0 + ld.d t2, X, 1 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 + addi.d I, I, -1 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmax.d x3, VM0, x1 + vfcmp.ceq.d VT0, x3, VM0 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vadd.w VI1, VI1, VINC4 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VI4, x1 + vfmul.s x4, VI4, x2 + vfcmp.clt.s VT0, x1, VI3 + vfcmp.clt.s VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.s x1, x1, x2 + vfmax.s x3, VM0, x1 + vfcmp.ceq.s VT0, x3, VM0 +#endif + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +#ifdef DOUBLE +.L26: + vfmaxa.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 +#else +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +#endif +.L21: //N<4 + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMAX a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/icamin_lasx.S b/kernel/loongarch64/icamin_lasx.S new file mode 100644 index 000000000..01abd45b2 --- /dev/null +++ b/kernel/loongarch64/icamin_lasx.S @@ -0,0 +1,555 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 +#ifdef DOUBLE + xvreplve0.d VM0, VM0 + xvxor.v VI3, VI3, VI3 // 0 + li.d I, -1 + xvreplgr2vr.d VI4, I + xvffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + xvreplve0.w VM0, VM0 + xvxor.v VI3, VI3, VI3 // 0 + li.w I, -1 + xvreplgr2vr.w VI4, I + xvffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC4 + xvld VX1, X, 4 * SIZE + addi.d I, I, -1 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 +#else + xvadd.w VI1, VI1, VINC8 + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VI4, x1 + xvfmul.s x4, VI4, x2 + xvfcmp.clt.s VT0, x1, VI3 + xvfcmp.clt.s VINC4, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC4 +#endif + XVFADD x1, x1, x2 + XVFMIN x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmin.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmin.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmin.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + xvadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif + addi.d I, I, -1 + XVFMUL x3, VI4, x1 + XVFMUL x4, VI4, x2 + XVCMPLT VT0, x1, VI3 + XVCMPLT VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + XVFADD x1, x1, x2 + XVFMIN x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmina.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmina.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmina.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 +#endif + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 + .align 3 + +.L21: //N<4 + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 +#else + fmov.s $f16, $f20 + .align 3 + +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v x1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + fcmp.clt.s $fcc0, $f15, $f13 + fsel $f15, $f15, $f13, $fcc0 + fsel $f20, $f20, $f16, $fcc0 + movfr2gr.s i0, $f20 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 +#endif + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMIN a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/icamin_lsx.S b/kernel/loongarch64/icamin_lsx.S new file mode 100644 index 000000000..a08cd33c5 --- /dev/null +++ b/kernel/loongarch64/icamin_lsx.S @@ -0,0 +1,425 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC4 $vr17 +#define VINC8 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 + vreplvei.w VM0, VM0, 0 + vxor.v VI3, VI3, VI3 // 0 +#ifdef DOUBLE + li.d I, -1 + vreplgr2vr.d VI4, I + vffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + li.w I, -1 + vreplgr2vr.w VI4, I + vffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + addi.d I, I, -1 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmin.d x3, VM0, x1 + vfcmp.ceq.d VT0, x3, VM0 + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vadd.w VI1, VI1, VINC4 + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + VFMUL x3, VI4, x1 + VFMUL x4, VI4, x2 + VCMPLT VT0, x1, VI3 + VCMPLT VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + VFADD x1, x1, x2 + VFMIN x3, VM0, x1 + VCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmin.d x3, VM0, x1 + ld.d t1, X, 0 * SIZE + vfcmp.ceq.d VT0, x3, VM0 + ld.d t2, X, 1 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vadd.w VI1, VI1, VINC4 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + addi.d I, I, -1 + VFMUL x3, VI4, x1 + VFMUL x4, VI4, x2 + VCMPLT VT0, x1, VI3 + VCMPLT VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + VFADD x1, x1, x2 + VFMIN x3, VM0, x1 + VCMPEQ VT0, x3, VM0 + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + vfmina.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 +#endif + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: +#ifdef DOUBLE + movfr2gr.d i0, $f20 + .align 3 +#else + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +#endif +.L21: //N<4 + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMIN a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/imax_lasx.S b/kernel/loongarch64/imax_lasx.S new file mode 100644 index 000000000..2d3d5e9d3 --- /dev/null +++ b/kernel/loongarch64/imax_lasx.S @@ -0,0 +1,533 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfcmp.clt.d VT0, VX0, VX1 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + xvfcmp.clt.d VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else + xvadd.w VI1, VI1, VINC8 + xvfcmp.clt.s VT0, VM0, VX0 + addi.d I, I, -1 + xvbitsel.v VM0, VM0, VX0, VT0 + xvbitsel.v VI0, VI0, VI1, VT0 + addi.d X, X, 8 * SIZE +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfcmp.clt.d VT0, x1, x2 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 +#endif + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + XVCMPLT VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + XVCMPLT VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 0 + xvinsgr2vr.w VM0, t2, 1 + xvinsgr2vr.w VM0, t3, 2 + xvinsgr2vr.w VM0, t4, 3 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t2, TEMP, 0 * SIZE +#endif + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX +#ifdef DOUBLE + xvinsgr2vr.d VM0, t1, 0 + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + xvinsgr2vr.w VM0, t1, 4 + xvinsgr2vr.w VM0, t2, 5 + xvinsgr2vr.w VM0, t3, 6 + xvinsgr2vr.w VM0, t4, 7 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI1, VI1, VINC8 + xvadd.d VI2, VI1, VINC4 + xvfcmp.clt.d VT0, VX0, VX1 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + xvfcmp.clt.d VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvadd.w VI1, VI1, VINC8 + xvfcmp.clt.s VT0, VM0, VX0 + addi.d I, I, -1 + xvbitsel.v VM0, VM0, VX0, VT0 + xvbitsel.v VI0, VI0, VI1, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 +#endif + XVCMPLT VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + XVCMPLT VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + XVCMPLT VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 +#else + fmov.s $f16, $f20 +#endif + .align 3 + +#ifndef DOUBLE +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v x1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + xvfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + xvfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + xvfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + fcmp.clt.s $fcc0, $f15, $f13 + fsel $f15, $f15, $f13, $fcc0 + fsel $f20, $f20, $f16, $fcc0 + movfr2gr.s i0, $f20 +#endif + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + CMPLT $fcc0, $f15, $f9 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/imax_lsx.S b/kernel/loongarch64/imax_lsx.S new file mode 100644 index 000000000..92556d4e6 --- /dev/null +++ b/kernel/loongarch64/imax_lsx.S @@ -0,0 +1,428 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, $vr18 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, $vr17 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x1, x3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, VM0, x1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + vadd.w VI1, VI1, $vr18 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VCMPLT VT0, x1, x2 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + VCMPLT VT0, x3, x4 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + VCMPLT VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, $vr18 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, $vr17 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x1, x3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, VM0, x1 + addi.d I, I, -1 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, $vr18 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfcmp.clt.s VT0, x1, x2 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + vfcmp.clt.s VT0, x3, x4 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + vfcmp.clt.s VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + VCMPLT VT0, x1, x2 + vbitsel.v VM0, x1, x2, VT0 + vbitsel.v VI0, VI1, VI2, VT0 +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + VCMPLT VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + .align 3 + +.L27: +#ifndef DOUBLE + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + VCMPLT VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + VCMPLT VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#endif + MTG i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + CMPLT $fcc0, $f15, $f9 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/imin_lasx.S b/kernel/loongarch64/imin_lasx.S new file mode 100644 index 000000000..5306828e2 --- /dev/null +++ b/kernel/loongarch64/imin_lasx.S @@ -0,0 +1,534 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + XVCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + XVCMPLT VT0, VM1, VM0 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else + xvadd.w VI1, VI1, VINC8 + XVCMPLT VT0, VX0, VM0 + addi.d I, I, -1 + xvbitsel.v VM0, VM0, VX0, VT0 + xvbitsel.v VI0, VI0, VI1, VT0 + addi.d X, X, 8 * SIZE +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 +#endif + XVCMPLT VT0, x2, x1 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + XVCMPLT VT0, x4, x3 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + XVCMPLT VT0, VM1, VM0 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 0 + xvinsgr2vr.w VM0, t2, 1 + xvinsgr2vr.w VM0, t3, 2 + xvinsgr2vr.w VM0, t4, 3 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 4 + xvinsgr2vr.w VM0, t2, 5 + xvinsgr2vr.w VM0, t3, 6 + xvinsgr2vr.w VM0, t4, 7 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI1, VI1, VINC8 + xvadd.d VI2, VI1, VINC4 + XVCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + XVCMPLT VT0, VM1, VM0 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvadd.w VI1, VI1, VINC8 + XVCMPLT VT0, VX0, VM0 + addi.d I, I, -1 + xvbitsel.v VM0, VM0, VX0, VT0 + xvbitsel.v VI0, VI0, VI1, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 +#endif + XVCMPLT VT0, x2, x1 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + XVCMPLT VT0, x4, x3 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + XVCMPLT VT0, VM1, VM0 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#ifdef DOUBLE + MTG i0, $f20 +#else + fmov.s $f16, $f20 +#endif + .align 3 + +#ifndef DOUBLE +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + XVCMPLT VT0, x2, x1 + xvbitsel.v x1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + XVCMPLT VT0, x4, x3 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + XVCMPLT VT0, x1, VM0 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L262 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + xvfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L272 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + xvfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L282 + XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + xvfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L292 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + CMPLT $fcc0, $f13, $f15 + fsel $f15, $f15, $f13, $fcc0 + fsel $f20, $f20, $f16, $fcc0 + MTG i0, $f20 +#endif + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + CMPLT $fcc0, $f9, $f15 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/imin_lsx.S b/kernel/loongarch64/imin_lsx.S new file mode 100644 index 000000000..a0c411e7a --- /dev/null +++ b/kernel/loongarch64/imin_lsx.S @@ -0,0 +1,428 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, $vr18 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, $vr17 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x3, x1 + addi.d X, X, 8 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, x1, VM0 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + vadd.w VI1, VI1, $vr18 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM1, VM0 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VCMPLT VT0, x2, x1 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + VCMPLT VT0, x4, x3 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + VCMPLT VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, $vr18 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, $vr17 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x3, x1 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, x1, VM0 + addi.d I, I, -1 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, $vr18 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VCMPLT VT0, x2, x1 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + VCMPLT VT0, x4, x3 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + VCMPLT VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + VCMPLT VT0, x2, x1 + vbitsel.v VM0, x1, x2, VT0 + vbitsel.v VI0, VI1, VI2, VT0 +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + VCMPLT VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + .align 3 + +.L27: +#ifndef DOUBLE + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + VCMPLT VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + VCMPLT VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#endif + MTG i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + CMPLT $fcc0, $f9, $f15 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S new file mode 100644 index 000000000..d097b3045 --- /dev/null +++ b/kernel/loongarch64/loongarch64_asm.S @@ -0,0 +1,673 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#if __loongarch_grlen == 64 +#define LA_REG int64_t +#define REG_SIZE 8 +#define REG_LOG 3 +#define PTR_ADDI addi.d +#define PTR_ADD add.d +#define PTR_SUB sub.d +#define PTR_LD ld.d +#define PTR_ST st.d +#define PTR_SLLI slli.d +#define PTR_SRLI srli.d +#define PTR_SRAI srai.d +#define PTR_MUL mul.d +#define PTR_ALSL alsl.d +#elif __loongarch_grlen == 32 +#define LA_REG int32_t +#define REG_SIZE 4 +#define REG_LOG 2 +#define PTR_ADDI addi.w +#define PTR_ADD add.w +#define PTR_SUB sub.w +#define PTR_LD ld.w +#define PTR_ST st.w +#define PTR_SLLI slli.w +#define PTR_SRLI srli.w +#define PTR_SRAI srai.w +#define PTR_MUL mul.w +#define PTR_ALSL alsl.w +#else +// If neither of the above two conditions is supported, it means this is an early +// internal toolchain. To ensure maximum compatibility, the following approach is taken: +#define LA_REG int64_t +#define REG_SIZE 8 +#define REG_LOG 3 +#define PTR_ADDI addi.d +#define PTR_ADD add.d +#define PTR_SUB sub.d +#define PTR_LD ld.d +#define PTR_ST st.d +#define PTR_SLLI slli.d +#define PTR_SRLI srli.d +#define PTR_SRAI srai.d +#define PTR_MUL mul.d +#define PTR_ALSL alsl.d +#endif + +#if __loongarch_frlen == 64 +#define FREG_SIZE 8 +#define FREG_LOG 3 +#define PTR_FLD fld.d +#define PTR_FST fst.d +#elif __loongarch_frlen == 32 +#define FREG_SIZE 4 +#define FREG_LOG 2 +#define PTR_FLD fld.s +#define PTR_FST fst.s +#else +// If neither of the above two conditions is supported, it means this is an early +// internal toolchain. To ensure maximum compatibility, the following approach is taken: +#define FREG_SIZE 8 +#define FREG_LOG 3 +#define PTR_FLD fld.d +#define PTR_FST fst.d +#endif + +// The max registers available to the user which +// do not need to be preserved across calls. +// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html +#define MAX_INT_CALLER_SAVED 17 +#define MAX_FP_CALLER_SAVED 24 + +.altmacro // Enable alternate macro mode + +.macro push_if_used regs, fregs +.if \regs > MAX_INT_CALLER_SAVED + PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG) + push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 +.endif +.if \fregs > MAX_FP_CALLER_SAVED + PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG) + push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 +.endif +.endm // End push_if_used +.macro pop_if_used regs, fregs +.if \fregs > MAX_FP_CALLER_SAVED + pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 + PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG +.endif +.if \regs > MAX_INT_CALLER_SAVED + pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 + PTR_ADDI $sp, $sp, (\regs - MAX_INT_CALLER_SAVED) << REG_LOG +.endif +.endm // End pop_if_used +.macro push_regs from, to + PTR_ST $s\()\from, $sp, \from << REG_LOG +.if \to - \from + push_regs %from + 1, \to +.endif +.endm // End push_regs +.macro pop_regs from, to + PTR_LD $s\()\from, $sp, \from << REG_LOG +.if \to - \from + pop_regs %from + 1, \to +.endif +.endm // End pop_regs +.macro push_fregs from, to + PTR_FST $fs\()\from, $sp, \from << FREG_LOG +.if \to - \from + push_fregs %from + 1, \to +.endif +.endm // End push_fregs +.macro pop_fregs from, to + PTR_FLD $fs\()\from, $sp, \from << FREG_LOG +.if \to - \from + pop_fregs %from + 1, \to +.endif +.endm // End pop_fregs + +// +// Instruction Related Macros +// +// GLD +// +.macro GLD pre_op:req, suf_op=0, out:req, src:req, offset:req/* imm */, more:vararg +.ifeqs "\suf_op", "0" + \pre_op\()ld \out, \src, \offset +.else + \pre_op\()ld.\suf_op \out, \src, \offset +.endif +.ifnb \more + GLD \pre_op, \suf_op, \more +.endif +.endm + +// +// GLD_INC +// +.macro GLD_INC pre_op:req, suf_op=0, inc:req, out:req, src:req, offset:req/* imm */, more:vararg +.ifeqs "\suf_op", "0" + \pre_op\()ld \out, \src, \offset +.else + \pre_op\()ld.\suf_op \out, \src, \offset +.endif + PTR_ADDI \src, \src, \inc +.ifnb \more + GLD_INC \pre_op, \suf_op, \inc, \more +.endif +.endm +// +// GLDX is same as GLD except the stride is a register +// +.macro GLDX pre_op:req, suf_op=0, out:req, src:req, offset:req/* reg */, more:vararg +.ifeqs "\suf_op", "0" + \pre_op\()ldx \out, \src, \offset +.else + \pre_op\()ldx.\suf_op \out, \src, \offset +.endif +.ifnb \more + GLDX \pre_op, \suf_op, \more +.endif +.endm +// +// GLDREPL +// +.macro GLDREPL pre_op:req, suf_op:req, out:req, src:req, offset:req/* imm */, more:vararg + \pre_op\()ldrepl.\suf_op \out, \src, \offset +.ifnb \more + GLDREPL \pre_op, \suf_op, \more +.endif +.endm +// +// GST +// +.macro GST pre_op:req, suf_op=0, src:req, dst:req, offset:req/* imm */, more:vararg +.ifeqs "\suf_op", "0" + \pre_op\()st \src, \dst, \offset +.else + \pre_op\()st.\suf_op \src, \dst, \offset +.endif +.ifnb \more + GST \pre_op, \suf_op, \more +.endif +.endm +// +// GMUL +// +.macro GMUL pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()mul.\suf_op \out, \in0, \in1 +.ifnb \more + GMUL \pre_op, \suf_op, \more +.endif +.endm +// +// GMADD +// +.macro GMADD pre_op, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg + \pre_op\()madd.\suf_op \out, \in0, \in1, \in2 +.ifnb \more + GMADD \pre_op, \suf_op, \more +.endif +.endm +// +// GADD +// +.macro GADD pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()add.\suf_op \out, \in0, \in1 +.ifnb \more + GADD \pre_op, \suf_op, \more +.endif +.endm +// +// GADDI +// +.macro GADDI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()addi.\suf_op \out, \in0, \in1 +.ifnb \more + GADDI \pre_op, \suf_op, \more +.endif +.endm +// +// GSUB +// +.macro GSUB pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()sub.\suf_op \out, \in0, \in1 +.ifnb \more + GSUB \pre_op, \suf_op, \more +.endif +.endm +// +// GSLLI +// +.macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()slli.\suf_op \out, \in0, \in1 +.ifnb \more + GSLLI \pre_op, \suf_op, \more +.endif +.endm +// +// GINSVE0 +// +.macro GINSVE0 pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()insve0.\suf_op \out, \in0, \in1 +.ifnb \more + GINSVE0 \pre_op, \suf_op, \more +.endif +.endm +// +// GXOR +// +.macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()xor.\suf_op \out, \in0, \in1 +.ifnb \more + GXOR \pre_op, \suf_op, \more +.endif +.endm +// +// GPERMI +// +.macro GPERMI pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()permi.\suf_op \out, \in0, \in1 +.ifnb \more + GPERMI \pre_op, \suf_op, \more +.endif +.endm +// +// GNMSUB +// +.macro GNMSUB pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg + \pre_op\()nmsub.\suf_op \out, \in0, \in1, \in2 +.ifnb \more + GNMSUB \pre_op, \suf_op, \more +.endif +.endm +// +// GPRELD +// +.macro GPRELD in0:req, in1:req, in2:req, more:vararg + preld \in0, \in1, \in2 +.ifnb \more + GPRELD \more +.endif +.endm + +// +// Compound instructions +// +// GACC: Accumulate the values of vector registers +// +.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg +.ifeqs "\pre_op", "xvf" + xvpermi.q \out, \in, 0x01 + \pre_op\()add.\suf_op \in, \out, \in + xvpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.ifeqs "\suf_op", "s" + xvpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif + +.ifeqs "\pre_op", "vf" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.ifeqs "\suf_op", "s" + vpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif + +.ifeqs "\pre_op", "xv" + xvpermi.q \out, \in, 0x01 + \pre_op\()add.\suf_op \in, \out, \in + xvpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "d" + xvpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "w" + xvpackod.h \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "h" + xvpackod.b \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif +.endif +.endif + +.ifeqs "\pre_op", "v" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "d" + vpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "w" + vpackod.h \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "h" + vpackod.b \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif +.endif +.endif + +.ifnb \more + GACC \pre_op, \suf_op, \more +.endif +.endm +// +// GMOV +// +.macro GMOV pre_op:req, out:req, in:req, more:vararg + \pre_op\()or.v \out, \in, \in +.ifnb \more + GMOV \pre_op, \more +.endif +.endm + +// +// GCOMPLEXACC: Complex accumulate the values of vector registers +// pre_op: xvf or vf, differentiate between LSX or LASX instruction +// suf_op: s or d, differentiate between single precision or double precision complex numbers +// Note: When "pre_op = xvf && suf_op = s", in will be modified. +// +.macro GCOMPLEXACC pre_op:req, suf_op:req, out:req, in:req, more:vararg +.ifeqs "\pre_op", "xvf" + xvpermi.q \out, \in, 0x01 +.ifeqs "\suf_op", "s" + \pre_op\()add.\suf_op \in, \out, \in + xvpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.else + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif + +.ifeqs "\pre_op", "vf" +.ifeqs "\suf_op", "s" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.else + vor.v \out, \in, \in +.endif +.endif + + +.ifnb \more + GCOMPLEXACC \pre_op, \suf_op, \more +.endif +.endm + +// +// GCOMPLEXMUL: Complex multiplication, out = in0 * in1 +// xconj: default value 0. +// if !(xconj) +// out_r = in0_r * in1_r - in0_i * in1_i; +// out_i = in0_r * in1_i + in0_i * in1_r; +// else +// out_r = in0_r * in1_r + in0_i * in1_i; +// out_i = in0_r * in1_i - in0_i * in1_r; +// pre_op: xvf or vf, differentiate between LSX or LASX instruction +// suf_op: s or d, differentiate between single precision or double precision complex numbers +// +.macro GCOMPLEXMUL xconj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, tmp0:req, tmp1:req, tmp2:req, more:vararg +.ifeqs "\pre_op", "xvf" + xvxor.v \tmp1, \tmp1, \tmp1 +.ifeqs "\suf_op", "s" + xvpackev.w \tmp0, \in0, \in0 +.else + xvpackev.d \tmp0, \in0, \in0 +.endif +.else + vxor.v \tmp1, \tmp1, \tmp1 +.ifeqs "\suf_op", "s" + vpackev.w \tmp0, \in0, \in0 +.else + vpackev.d \tmp0, \in0, \in0 +.endif +.endif + + \pre_op\()sub.\suf_op \tmp1, \tmp1, \in0 + +.ifeqs "\pre_op", "xvf" +.ifeqs "\suf_op", "s" +.ifeqs "\xconj", "0" + xvpackod.w \tmp1, \in0, \tmp1 +.else + xvpackod.w \tmp1, \tmp1, \in0 +.endif + xvshuf4i.w \tmp2, \in1, 0xb1 +.else +.ifeqs "\xconj", "0" + xvpackod.d \tmp1, \in0, \tmp1 +.else + xvpackod.d \tmp1, \tmp1, \in0 +.endif + xvshuf4i.d \tmp2, \in1, 0x0b +.endif +.else +.ifeqs "\suf_op", "s" +.ifeqs "\xconj", "0" + vpackod.w \tmp1, \in0, \tmp1 +.else + vpackod.w \tmp1, \tmp1, \in0 +.endif + vshuf4i.w \tmp2, \in1, 0xb1 +.else +.ifeqs "\xconj", "0" + vpackod.d \tmp1, \in0, \tmp1 +.else + vpackod.d \tmp1, \tmp1, \in0 +.endif + vshuf4i.d \tmp2, \in1, 0x0b +.endif +.endif + + \pre_op\()mul.\suf_op \out, \tmp0, \in1 + \pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out + +.ifnb \more + GCOMPLEXMUL \xconj, \pre_op, \suf_op, \more +.endif +.endm + +// +// GCOMPLEXMADD: Complex multiply-accumulate, out = in0 * in1 + in2 +// xconj: default value 0 +// conj: default value 0 +// if !(CONJ) +// if !(XCONJ) +// out_r = in0_r * in1_r - in0_i * in1_i + in2_r; +// out_i = in0_r * in1_i + in0_i * in1_r + in2_i; +// else +// out_r = in0_r * in1_r + in0_i * in1_i + in2_r; +// out_i = in0_r * in1_i - in0_i * in1_r + in2_i; +// else +// if !(XCONJ) +// out_r = in0_r * in1_r + in0_i * in1_i + in2_r; +// out_i = in2_i - (in0_r * in1_i - in0_i * in1_r); +// else +// out_r = in0_r * in1_r - in0_i * in1_i + in2_r; +// out_i = in2_i - (in0_r * in1_i + in0_i * in1_r); +// pre_op: xvf or vf, differentiate between LSX or LASX instruction +// suf_op: s or d, differentiate between single precision or double precision complex numbers +// +.macro GCOMPLEXMADD xconj=0, conj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, tmp0:req, tmp1:req, tmp2:req, more:vararg +.ifeqs "\pre_op", "xvf" + xvxor.v \tmp1, \tmp1, \tmp1 +.ifeqs "\suf_op", "s" + xvpackev.w \tmp0, \in0, \in0 +.else + xvpackev.d \tmp0, \in0, \in0 +.endif +.else + vxor.v \tmp1, \tmp1, \tmp1 +.ifeqs "\suf_op", "s" + vpackev.w \tmp0, \in0, \in0 +.else + vpackev.d \tmp0, \in0, \in0 +.endif +.endif + + \pre_op\()madd.\suf_op \tmp2, \tmp0, \in1, \in2 +.ifeqs "\conj", "1" + \pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2 +.ifeqs "\pre_op", "xvf" +.ifeqs "\suf_op", "s" + xvshuf4i.w \tmp0, \tmp0, 0xb1 + xvpackev.w \out, \tmp0, \tmp2 +.else + xvshuf4i.d \tmp0, \tmp0, 0x0b + xvpackev.d \out, \tmp0, \tmp2 +.endif +.else +.ifeqs "\suf_op", "s" + vshuf4i.w \tmp0, \tmp0, 0xb1 + vpackev.w \out, \tmp0, \tmp2 +.else + vshuf4i.d \tmp0, \tmp0, 0x0b + vpackev.d \out, \tmp0, \tmp2 +.endif +.endif /* pre_op = xvf */ +.else + \pre_op\()add.\suf_op \out, \tmp2, \tmp1 +.endif /* conj = 1 */ + + \pre_op\()sub.\suf_op \tmp1, \tmp1, \in0 + +.ifeqs "\pre_op", "xvf" +.ifeqs "\suf_op", "s" +.ifeqs "\conj", "0" +.ifeqs "\xconj", "0" + xvpackod.w \tmp1, \in0, \tmp1 +.else + xvpackod.w \tmp1, \tmp1, \in0 +.endif +.else +.ifeqs "\xconj", "0" + xvpackod.w \tmp1, \in0, \in0 +.else + xvpackod.w \tmp1, \tmp1, \tmp1 +.endif +.endif + xvshuf4i.w \tmp2, \in1, 0xb1 +.else +.ifeqs "\conj", "0" +.ifeqs "\xconj", "0" + xvpackod.d \tmp1, \in0, \tmp1 +.else + xvpackod.d \tmp1, \tmp1, \in0 +.endif +.else +.ifeqs "\xconj", "0" + xvpackod.d \tmp1, \in0, \in0 +.else + xvpackod.d \tmp1, \tmp1, \tmp1 +.endif +.endif + xvshuf4i.d \tmp2, \in1, 0x0b +.endif +.else +.ifeqs "\suf_op", "s" +.ifeqs "\conj", "0" +.ifeqs "\xconj", "0" + vpackod.w \tmp1, \in0, \tmp1 +.else + vpackod.w \tmp1, \tmp1, \in0 +.endif +.else +.ifeqs "\xconj", "0" + vpackod.w \tmp1, \in0, \in0 +.else + vpackod.w \tmp1, \tmp1, \tmp1 +.endif +.endif + vshuf4i.w \tmp2, \in1, 0xb1 +.else +.ifeqs "\conj", "0" +.ifeqs "\xconj", "0" + vpackod.d \tmp1, \in0, \tmp1 +.else + vpackod.d \tmp1, \tmp1, \in0 +.endif +.else +.ifeqs "\xconj", "0" + vpackod.d \tmp1, \in0, \in0 +.else + vpackod.d \tmp1, \tmp1, \tmp1 +.endif +.endif + vshuf4i.d \tmp2, \in1, 0x0b +.endif +.endif + + \pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out + +.ifnb \more + GCOMPLEXMADD \xconj, \conj, \pre_op, \suf_op, \more +.endif +.endm + +// +// Media Related Macros +// +.macro GSBUTTERFLY pre_op, suf_op, out0, out1, in0, in1 + \pre_op\()ilvl.\suf_op \out0, \in0, \in1 + \pre_op\()ilvh.\suf_op \out1, \in0, \in1 +.endm +.macro GINTERLACE pre_op, suf_op, out0, out1, in0, in1 + \pre_op\()pickev.\suf_op \out0, \in0, \in1 + \pre_op\()pickod.\suf_op \out1, \in0, \in1 +.endm + +// +// TRANSPOSE4x4_D: Transpose 4x4 block with double-word elements in vectors, +// has no pre_op param. 128-bit vector instructions are not supported. +// +.macro GTRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \ + vt0, vt1 + GSBUTTERFLY xv, d, \vt0, \out1, \in1, \in0 + GSBUTTERFLY xv, d, \vt1, \out3, \in3, \in2 + GMOV xv, \out0, \vt0, \out2, \vt1, \vt1, \out3 + GPERMI xv, q, \out0, \out2, 0x02, \out2, \vt0, 0x31, \out3, \out1, 0x31, \out1, \vt1, 0x02 +.endm + +.macro GTRANSPOSE8x8_W out0, out1, out2, out3, out4, out5, out6, out7, \ + in0, in1, in2, in3, in4, in5, in6, in7, \ + tmp0, tmp1, tmp2, tmp3 + GSBUTTERFLY xv, w, \tmp0, \tmp2, \in2, \in0 + GSBUTTERFLY xv, w, \tmp1, \tmp3, \in3, \in1 + GSBUTTERFLY xv, w, \out0, \out1, \tmp1, \tmp0 + GSBUTTERFLY xv, w, \out2, \out3, \tmp3, \tmp2 + + GSBUTTERFLY xv, w, \tmp0, \tmp2, \in6, \in4 + GSBUTTERFLY xv, w, \tmp1, \tmp3, \in7, \in5 + GSBUTTERFLY xv, w, \out4, \out5, \tmp1, \tmp0 + GSBUTTERFLY xv, w, \out6, \out7, \tmp3, \tmp2 + + GMOV xv, \tmp0, \out0, \tmp1, \out1, \tmp2, \out2, \tmp3, \out3 + + GPERMI xv, q, \out0, \out4, 0x02, \out1, \out5, 0x02, \ + \out2, \out6, 0x02, \out3, \out7, 0x02, \ + \out4, \tmp0, 0x31, \out5, \tmp1, 0x31, \ + \out6, \tmp2, 0x31, \out7, \tmp3, 0x31 +.endm diff --git a/kernel/loongarch64/max_lasx.S b/kernel/loongarch64/max_lasx.S new file mode 100644 index 000000000..3215ae394 --- /dev/null +++ b/kernel/loongarch64/max_lasx.S @@ -0,0 +1,229 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMAX VM1, VX0, VX1 + XVFMAX VM2, VX2, VX3 + XVFMAX VM0, VM0, VM1 + XVFMAX VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMAX VM1, VX0, VX1 + XVFMAX VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAX VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAX VM1, VX0, VX1 + XVFMAX VM2, VX2, VX3 + XVFMAX VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMAX VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAX $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + xvfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAX VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAX VM1, VX0, VX1 + XVFMAX VM2, VX2, VX3 + XVFMAX VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMAX VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAX $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/max_lsx.S b/kernel/loongarch64/max_lsx.S new file mode 100644 index 000000000..a2100875d --- /dev/null +++ b/kernel/loongarch64/max_lsx.S @@ -0,0 +1,228 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMAX VM1, VX0, VX1 + VFMAX VM2, VX2, VX3 + VFMAX VM0, VM0, VM1 + VFMAX VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMAX VM1, VX0, VX1 + VFMAX VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAX VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAX VM1, VX0, VX1 + VFMAX VM2, VX2, VX3 + VFMAX VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAX $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM1, VM1, VM2 + vfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmaxa.s VM1, VX0, VX1 + vfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAX VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAX VM1, VX0, VX1 + VFMAX VM2, VX2, VX3 + VFMAX VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAX $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/min_lasx.S b/kernel/loongarch64/min_lasx.S new file mode 100644 index 000000000..890c8882d --- /dev/null +++ b/kernel/loongarch64/min_lasx.S @@ -0,0 +1,229 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMIN VM1, VX0, VX1 + XVFMIN VM2, VX2, VX3 + XVFMIN VM0, VM0, VM1 + XVFMIN VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMIN VM1, VX0, VX1 + XVFMIN VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMIN VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMIN VM1, VX0, VX1 + XVFMIN VM2, VX2, VX3 + XVFMIN VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMIN VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMIN $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + xvfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMIN VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMIN VM1, VX0, VX1 + XVFMIN VM2, VX2, VX3 + XVFMIN VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMIN VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMIN $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/min_lsx.S b/kernel/loongarch64/min_lsx.S new file mode 100644 index 000000000..1bc32a0ed --- /dev/null +++ b/kernel/loongarch64/min_lsx.S @@ -0,0 +1,228 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMIN VM1, VX0, VX1 + VFMIN VM2, VX2, VX3 + VFMIN VM0, VM0, VM1 + VFMIN VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMIN VM1, VX0, VX1 + VFMIN VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMIN VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMIN VM1, VX0, VX1 + VFMIN VM2, VX2, VX3 + VFMIN VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMIN $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM1, VM1, VM2 + vfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmaxa.s VM1, VX0, VX1 + vfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMIN VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMIN VM1, VX0, VX1 + VFMIN VM2, VX2, VX3 + VFMIN VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMIN $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/rot_lasx.S b/kernel/loongarch64/rot_lasx.S new file mode 100644 index 000000000..71378e0b2 --- /dev/null +++ b/kernel/loongarch64/rot_lasx.S @@ -0,0 +1,1602 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VT0 $xr10 +#define VT1 $xr18 +#define VXC $xr23 +#define VXS $xr9 +#define VXZ $xr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT +#ifdef DOUBLE + movfr2gr.d t1, C + xvreplgr2vr.d VXC, t1 + movfr2gr.d t2, S + xvreplgr2vr.d VXS, t2 + movfr2gr.d t3, a1 + xvreplgr2vr.d VXZ, t3 +#else + movfr2gr.s t1, C + xvreplgr2vr.w VXC, t1 + movfr2gr.s t2, S + xvreplgr2vr.w VXS, t2 + movfr2gr.s t3, a1 + xvreplgr2vr.w VXZ, t3 +#endif + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE +#endif + XVMUL VT0, VX0, VXC + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VX0, VXS + XVMSUB VT1, VX2, VXC, VT1 + xvst VT0, X, 0 * SIZE + xvst VT1, Y, 0 * SIZE +#ifdef DOUBLE + XVMUL VT0, VX1, VXC + XVFMADD VT0, VX3, VXS, VT0 + XVMUL VT1, VX1, VXS + XVMSUB VT1, VX3, VXC, VT1 + xvst VT0, X, 4 * SIZE + xvst VT1, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE +#endif + XVMUL VT0, VX0, VXC + XVMUL VT1, VX2, VXC + xvst VT0, X, 0 * SIZE + xvst VT1, Y, 0 * SIZE +#ifdef DOUBLE + XVMUL VT0, VX1, VXC + XVMUL VT1, VX3, VXC + xvst VT0, X, 4 * SIZE + xvst VT1, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE +#endif + XVMUL VT0, VX2, VXS + XVMUL VT1, VX0, VXS + XVFSUB VT1, VXZ, VT1 + xvst VT0, X, 0 * SIZE + xvst VT1, Y, 0 * SIZE +#ifdef DOUBLE + XVMUL VT0, VX3, VXS + XVMUL VT1, VX1, VXS + xvfsub.d VT1, VXZ, VT1 + xvst VT0, X, 4 * SIZE + xvst VT1, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + xvst VXZ, X, 0 * SIZE + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 4 * SIZE + xvst VXZ, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L120 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L122 // C!=0 S==0 + b .L121 // C!=0 S!=0 + .align 3 + +.L120: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L124 // C==0 S==0 + b .L123 // C==0 S!=0 + .align 3 + +.L121: // C!=0 S!=0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY +#endif + XVMUL VT0, VX0, VXC + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VX0, VXS + XVMSUB VT1, VX2, VXC, VT1 + +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvst VT0, X, 0 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + XVMUL VT0, VX1, VXC + XVFMADD VT0, VX3, VXS, VT0 + XVMUL VT1, VX1, VXS + XVMSUB VT1, VX3, VXC, VT1 + xvst VT0, X, 4 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 +#else + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // C!=0 S==0 +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX0, VXC + xvfmul.d VT1, VX2, VXC + xvld VX1, X, 4 * SIZE + xvst VT0, X, 0 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX1, VXC + xvfmul.d VT1, VX3, VXC + addi.d I, I, -1 + xvst VT0, X, 4 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmul.s VT1, VX2, VXC + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // C==0 S!=0 +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX2, VXS + xvfmul.d VT1, VX0, VXS + xvfsub.d VT1, VXZ, VT1 + xvst VT0, X, 0 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX3, VXS + xvfmul.d VT1, VX1, VXS + xvfsub.d VT1, VXZ, VT1 + addi.d I, I, -1 + xvst VT0, X, 4 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX2, VXS + xvfmul.s VT1, VX0, VXS + xvfsub.s VT1, VXZ, VT1 + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // C==0 S==0 + xvst VXZ, X, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 0 * SIZE + xvst VXZ, X, 4 * SIZE + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 +#else + xvst VXZ, X, 0 * SIZE + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L124 + move Y, YY + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L210 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L212 // C!=0 S==0 + b .L211 // C!=0 S!=0 + .align 3 + +.L210: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L214 // C==0 S==0 + b .L213 // C==0 S!=0 + .align 3 + +.L211: // C!=0 S!=0 +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VXC, VX0 + xvfmadd.d VT0, VX2, VXS, VT0 + xvfmul.d VT1, VXS, VX0 + xvfmsub.d VT1, VX2, VXC, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VX1, VXC + xvfmadd.d VT0, VX3, VXS, VT0 + xvfmul.d VT1, VX1, VXS + xvfmsub.d VT1, VX3, VXC, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXC, VX0 + xvfmadd.s VT0, VX2, VXS, VT0 + xvfmul.s VT1, VX0, VXS + xvfmsub.s VT1, VX2, VXC, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // C!=0 S==0 +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VXC, VX0 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvfmul.d VT1, VX2, VXC + xvst VT1, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VX1, VXC + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvfmul.d VT1, VX3, VXS + xvst VT1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXC, VX0 + xvfmul.s VT1, VX2, VXC + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // C==0 S!=0 +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VXS, VX2 + xvfmul.d VT1, VXS, VX0 + xvfsub.d VT1, VXZ, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VX3, VXS + xvfmul.d VT1, VX1, VXS + xvfsub.d VT1, VXZ, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXS, VX2 + xvfmul.s VT1, VXS, VX0 + xvfsub.s VT1, VXZ, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // C==0 S==0 +#ifdef DOUBLE + xvstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + xvstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 4 * SIZE +#else + xvstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + xvstelm.w VXZ, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 7 + add.d XX, XX, INCX +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX0, VXC + xvfmadd.d VT0, VX2, VXS, VT0 + xvfmul.d VT1, VX0, VXS + xvfmsub.d VT1, VX2, VXC, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX1, VXC + xvfmadd.d VT0, VX3, VXS, VT0 + xvfmul.d VT1, VX1, VXS + xvfmsub.d VT1, VX3, VXC, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmadd.s VT0, VX2, VXS, VT0 + xvfmul.s VT1, VX0, VXS + xvfmsub.s VT1, VX2, VXC, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY +#endif + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX0, VXC + xvfmul.d VT1, VX2, VXC + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX1, VXC + xvfmul.d VT1, VX3, VXC + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmul.s VT1, VX2, VXC + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY +#endif + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX2, VXS + xvfmul.d VT1, VX0, VXS + xvfsub.d VT1, VXZ, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX3, VXS + xvfmul.d VT1, VX0, VXS + xvfsub.d VT1, VXZ, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX2, VXS + xvfmul.s VT1, VX0, VXS + xvfsub.s VT1, VXZ, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY +#endif + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + xvstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 +#ifdef DOUBLE + move X, XX + move Y, YY +#endif + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE + MUL $f10, $f12, C + MADD $f10, $f13, S, $f10 + ST $f10, X, 0 * SIZE + addi.d I, I, -1 + MUL $f20, $f12, S + MSUB $f20, $f13, C, $f20 + ST $f20, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/rot_lsx.S b/kernel/loongarch64/rot_lsx.S new file mode 100644 index 000000000..3bb77aaec --- /dev/null +++ b/kernel/loongarch64/rot_lsx.S @@ -0,0 +1,1790 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VT0 $vr10 +#define VT1 $vr18 +#define VXC $vr23 +#define VXS $vr9 +#define VXZ $vr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT +#ifdef DOUBLE + movfr2gr.d t1, C + vreplgr2vr.d VXC, t1 + movfr2gr.d t2, S + vreplgr2vr.d VXS, t2 + movfr2gr.d t3, a1 + vreplgr2vr.d VXZ, t3 +#else + movfr2gr.s t1, C + vreplgr2vr.w VXC, t1 + movfr2gr.s t2, S + vreplgr2vr.w VXS, t2 + movfr2gr.s t3, a1 + vreplgr2vr.w VXZ, t3 +#endif + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE +#endif + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 +#ifdef DOUBLE + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 +#endif + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE +#ifdef DOUBLE + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE +#endif + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC +#ifdef DOUBLE + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC +#endif + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE +#ifdef DOUBLE + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE +#endif + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 +#ifdef DOUBLE + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 +#endif + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE +#ifdef DOUBLE + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, Y, 2 * SIZE +#endif + vst VXZ, X, 4 * SIZE + vst VXZ, Y, 4 * SIZE +#ifdef DOUBLE + vst VXZ, X, 6 * SIZE + vst VXZ, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L120 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L122 // C!=0 S==0 + b .L121 // C!=0 S!=0 + .align 3 + +.L120: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L124 // C==0 S==0 + b .L123 // C==0 S!=0 + .align 3 + +.L121: // C!=0 S!=0 +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE +#endif + add.d Y, Y, INCY +#ifndef DOUBLE + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY +#endif + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vst VT0, X, 0 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 +#endif + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vst VT0, X, 4 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // C!=0 S==0 +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 +#else + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 +#endif + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vst VT0, X, 0 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 4 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 4 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // C==0 S!=0 +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 +#else + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 +#endif + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 0 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 4 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 4 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, X, 4 * SIZE +#ifdef DOUBLE + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L124 +#ifdef DOUBLE + move Y, YY +#endif + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L210 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L212 // C!=0 S==0 + b .L211 // C!=0 S!=0 + .align 3 + +.L210: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L214 // C==0 S==0 + b .L213 // C==0 S!=0 + .align 3 + +.L211: // C!=0 S!=0 +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 +#endif + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VXS, VX0 + VMSUB VT1, VX2, VXC, VT1 +#ifdef DOUBLE + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VXS, VX0 + VMSUB VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 6 * SIZE +#else + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // C!=0 S==0 +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 +#endif + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VMUL VT1, VX2, VXC + +#ifdef DOUBLE + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VMUL VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXS + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXS + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + vst VT1, Y, 6 * SIZE +#else + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXS + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // C==0 S!=0 +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 +#endif + add.d X, X, INCX + VMUL VT0, VXS, VX2 + VMUL VT1, VXS, VX0 + VFSUB VT1, VXZ, VT1 + +#ifdef DOUBLE + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + VMUL VT0, VXS, VX2 + VMUL VT1, VXS, VX0 + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 6 * SIZE +#else + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // C==0 S==0 +#ifdef DOUBLE + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 +#else + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 +#endif + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 +#ifdef DOUBLE + move X, XX +#endif + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY +#endif + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX +#ifndef DOUBLE + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 +#ifdef DOUBLE + move X, XX + move Y, YY +#endif + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE + MUL $f10, $f12, C + MADD $f10, $f13, S, $f10 + ST $f10, X, 0 * SIZE + addi.d I, I, -1 + MUL $f20, $f12, S + MSUB $f20, $f13, C, $f20 + ST $f20, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/scal_lasx.S b/kernel/loongarch64/scal_lasx.S new file mode 100644 index 000000000..48e2c0718 --- /dev/null +++ b/kernel/loongarch64/scal_lasx.S @@ -0,0 +1,282 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VT0 $xr14 +#define VT1 $xr15 +#define VALPHA $xr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + movgr2fr.d a2, TEMP + FFINT a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + CMPEQ $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + MTG TEMP, ALPHA +#ifdef DOUBLE + xvreplgr2vr.d VALPHA, TEMP +#else + xvreplgr2vr.w VALPHA, TEMP +#endif + move XX, X + .align 3 + +.L10: //ALPHA!=0|1 and INCX!=1 + bge $r0, I, .L32 + .align 3 +.L11: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvfmul.d VT1, VX1, VALPHA + xvstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VT0, VX0, VALPHA + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 +#endif + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L23: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 +.L25: + xvxor.v VX0, VX0, VX0 + xvst VX0, X, 0 * SIZE +#ifdef DOUBLE + xvst VX0, X, 4 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L27: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + MTG TEMP, ALPHA +#ifdef DOUBLE + xvreplgr2vr.d VALPHA , TEMP +#else + xvreplgr2vr.w VALPHA , TEMP +#endif + .align 3 + +.L31: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvfmul.d VT0, VX0, VALPHA + xvfmul.d VT1, VX1, VALPHA + xvst VT0, X, 0 * SIZE + xvst VT1, X, 4 * SIZE +#else + xvfmul.s VT0, VX0, VALPHA + xvst VT0, X, 0 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L33: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + MUL a1, ALPHA, a1 + ST a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/scal_lsx.S b/kernel/loongarch64/scal_lsx.S new file mode 100644 index 000000000..1ffce7db2 --- /dev/null +++ b/kernel/loongarch64/scal_lsx.S @@ -0,0 +1,301 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VT0 $vr14 +#define VT1 $vr15 +#define VALPHA $vr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + movgr2fr.d a2, TEMP + FFINT a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + CMPEQ $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + MTG TEMP, ALPHA +#ifdef DOUBLE + vreplgr2vr.d VALPHA, TEMP +#else + vreplgr2vr.w VALPHA, TEMP +#endif + move XX, X + .align 3 + +.L10: //ALPHA!=0|1 and INCX!=1 + bge $r0, I, .L32 + .align 3 + +.L11: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT1, VX1, VALPHA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT1, VX1, VALPHA + vstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT1, XX, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + vfmul.s VT0, VX0, VALPHA + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vfmul.s VT1, VX1, VALPHA + vstelm.w VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 3 +#endif + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L23: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 + +.L25: + vxor.v VX0, VX0, VX0 + vst VX0, X, 0 * SIZE +#ifdef DOUBLE + vst VX0, X, 2 * SIZE + vst VX0, X, 4 * SIZE + vst VX0, X, 6 * SIZE +#else + vst VX0, X, 4 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L27: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + MTG TEMP, ALPHA +#ifdef DOUBLE + vreplgr2vr.d VALPHA , TEMP +#else + vreplgr2vr.w VALPHA , TEMP +#endif + .align 3 + +.L31: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vfmul.d VT0, VX0, VALPHA + vfmul.d VT1, VX1, VALPHA + vld VX0, X, 4 * SIZE + vst VT0, X, 0 * SIZE + vst VT1, X, 2 * SIZE + vfmul.d VT0, VX0, VALPHA + vld VX1, X, 6 * SIZE + vst VT0, X, 4 * SIZE + vfmul.d VT1, VX1, VALPHA + vst VT1, X, 6 * SIZE + addi.d I, I, -1 +#else + vld VX1, X, 4 * SIZE + vfmul.s VT0, VX0, VALPHA + vfmul.s VT1, VX1, VALPHA + addi.d I, I, -1 + vst VT0, X, 0 * SIZE + vst VT1, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L33: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + MUL a1, ALPHA, a1 + ST a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/sgemm_kernel_16x8_lasx.S b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S new file mode 100644 index 000000000..bd609394e --- /dev/null +++ b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S @@ -0,0 +1,2348 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2023/08/23 guxiwei +* Parameter: +* SGEMM_DEFAULT_UNROLL_N 8 +* SGEMM_DEFAULT_UNROLL_M 16 +* SGEMM_DEFAULT_P 256 +* SGEMM_DEFAULT_Q 256 +* SGEMM_DEFAULT_R 1024 +* A_PRE 1024 +* B_PRE 256 // Enable prefetching for B results in a performance decrease, temporarily disabled. +* +* +* Performance at Loongson 3A5000 2.5GHz with 5000x5000x5000: +* 1 thread: 71.7 GFLOPS +* 2 threads: 142.6 GFLOPS +* 3 threads: 211.5 GFLOPS +* 4 threads: 265.0 GFLOPS +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA $f0 // param 4: alpha +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc + +#ifdef TRMMKERNEL +#define OFFSET $r11 // param 9: offset +#endif +#define OFF $r12 + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define C4 $r25 +#define C5 $r26 +#define C6 $r27 +#define C7 $r28 +#define T0 $r29 +#define T1 $r30 +#undef ZERO +#define ZERO $r0 + +/* LASX Vectors + * Store 16 sets of 32-bit data in A using UO and U1, with each register holding 8 data. + * Use X0 through X7 to store 8 sets of 32-bit data in B, with each register holding a broadcast value of a single data. + * Use D0 to D15 to store intermediate values of the computation. + * Use VALPHA to store the broadcast value of alpha + */ +#define U0 $xr0 +#define U1 $xr1 +#define X0 $xr2 +#define X1 $xr3 +#define X2 $xr4 +#define X3 $xr5 +#define X4 $xr6 +#define X5 $xr7 +#define X6 $xr8 +#define X7 $xr9 +#define D0 $xr10 +#define D1 $xr11 +#define D2 $xr12 +#define D3 $xr13 +#define D4 $xr14 +#define D5 $xr15 +#define D6 $xr16 +#define D7 $xr17 +#define D8 $xr18 +#define D9 $xr19 +#define D10 $xr20 +#define D11 $xr21 +#define D12 $xr22 +#define D13 $xr23 +#define D14 $xr24 +#define D15 $xr25 +#define VALPHA $xr26 + +/* Prefetch interval */ +#define A_PRE 0x400 +#define B_PRE 0x100 + +// Loops outline: +// .L_N8 <-------------------------------------------------------------------------------------------- /* if N >> 3 == 0, goto .L_N7; else, enter .L_N8. */ +// | .L_M16 <--------------------- | /* if M >> 4 == 0, goto .L_M8; Otherwise, enter .L_M16. */ +// | | .L_M16_TL1 | | +// | | .L_M16_L7 | The entire core loop of the function, KERNEK16x8 | +// | | .L_M16_L71 | | +// | | .L_M16_L0 ---------------- | +// | .L_M8 | +// | | .L_M8_TL1 | | +// | | .L_M8_L7 | KERNEK8x8 | +// | | .L_M8_L71 | | +// | | .L_M8_L0 | | +// | .L_M4 | +// | | .L_M4_TL1 | | +// | | .L_M4_L7 | KERNEK4x8 | +// | | .L_M4_L71 | | +// | | .L_M4_L0 | | +// | .L_M2 | +// | | .L_M2_TL1 | | +// | | .L_M2_L7 | KERNEK2x8 | +// | | .L_M2_L71 | | +// | | .L_M2_L0 | | +// | .L_M1 | +// | | .L_M1_TL1 | | +// | | .L_M1_L7 | KERNEK1x8 | +// | | .L_M1_L71 | | +// | | .L_M1_L0 | | +// | .L_M0------------------------------------------------------------------------------------------ +// .L_N7 /* if N & 7 == 0, goto .L_N0; else, enter .L_N4 */ +// .L_N4 +// | .L_N4_M16 <--------------------- +// | | .L_N4_M16_TL1 | +// | | .L_N4_M16_L7 | KERNEL16x4 +// | | .L_N4_M16_L71 | +// | | .L_N4_M16_L0 ---------------- +// | .L_N4_M8 +// | | .L_N4_M8_TL1 | +// | | .L_N4_M8_L7 | KERNEL8x4 +// | | .L_N4_M8_L71 | +// | | .L_N4_M8_L0 | +// | .L_N4_M4 +// | | .L_N4_M4_TL1 | +// | | .L_N4_M4_L7 | KERNEL4x4 +// | | .L_N4_M4_L71 | +// | | .L_N4_M4_L0 | +// | .L_N4_M2 +// | | .L_N4_M2_TL1 | +// | | .L_N4_M2_L7 | KERNEL2x4 +// | | .L_N4_M2_L71 | +// | | .L_N4_M2_L0 | +// | .L_N4_M1 +// | | .L_N4_M1_TL1 | +// | | .L_N4_M1_L7 | KERNEL1x4 +// | | .L_N4_M1_L71 | +// | | .L_N4_M1_L0 | +// | .L_N4_M0 +// .L_N3 /* if N & 2 == 0, goto .L_N1; else enter .L_N2 */ +// .L_N2 +// | .L_N2_M16 <--------------------- +// | | .L_N2_M16_TL1 | +// | | .L_N2_M16_L7 | KERNEL16x2 +// | | .L_N2_M16_L71 | +// | | .L_N2_M16_L0 ---------------- +// | .L_N2_M8 +// | | .L_N2_M8_TL1 | +// | | .L_N2_M8_L7 | KERNEL8x2 +// | | .L_N2_M8_L71 | +// | | .L_N2_M8_L0 | +// | .L_N2_M4 +// | | .L_N2_M4_TL1 | +// | | .L_N2_M4_L7 | KERNEL4x2 +// | | .L_N2_M4_L71 | +// | | .L_N2_M4_L0 | +// | .L_N2_M2 +// | | .L_N2_M2_TL1 | +// | | .L_N2_M2_L7 | KERNEL2x2 +// | | .L_N2_M2_L71 | +// | | .L_N2_M2_L0 | +// | .L_N2_M1 +// | | .L_N2_M1_TL1 | +// | | .L_N2_M1_L7 | KERNEL1x2 +// | | .L_N2_M1_L71 | +// | | .L_N2_M1_L0 | +// | .L_N2_M0 +// .L_N1 +// | .L_N1_M16 <--------------------- +// | | .L_N1_M16_TL1 | +// | | .L_N1_M16_L7 | KERNEL16x1 +// | | .L_N1_M16_L71 | +// | | .L_N1_M16_L0 ---------------- +// | .L_N1_M8 +// | | .L_N1_M8_TL1 | +// | | .L_N1_M8_L7 | KERNEL8x1 +// | | .L_N1_M8_L71 | +// | | .L_N1_M8_L0 | +// | .L_N1_M4 +// | | .L_N1_M4_TL1 | +// | | .L_N1_M4_L7 | KERNEL4x1 +// | | .L_N1_M4_L71 | +// | | .L_N1_M4_L0 | +// | .L_N1_M2 +// | | .L_N1_M2_TL1 | +// | | .L_N1_M2_L7 | KERNEL2x1 +// | | .L_N1_M2_L71 | +// | | .L_N1_M2_L0 | +// | .L_N1_M1 +// | | .L_N1_M1_TL1 | +// | | .L_N1_M1_L7 | KERNEL1x1 +// | | .L_N1_M1_L71 | +// | | .L_N1_M1_L0 | +// | .L_N1_M0 +// .L_N0 + +/*************** sgemm_kernel_macros ***************/ +.macro KERNEL1x16x8_START + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMUL xvf, s, D0, U0, X0, D1, U1, X0 + preld 0, C0, 0x00 + GMUL xvf, s, D2, U0, X1, D3, U1, X1 + preld 0, C1, 0x00 + GMUL xvf, s, D4, U0, X2, D5, U1, X2 + preld 0, C2, 0x00 + GMUL xvf, s, D6, U0, X3, D7, U1, X3 + preld 0, C3, 0x00 + GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C + GMUL xvf, s, D8, U0, X4, D9, U1, X4 + preld 0, C4, 0x00 + GMUL xvf, s, D10, U0, X5, D11, U1, X5 + preld 0, C5, 0x00 + GMUL xvf, s, D12, U0, X6, D13, U1, X6 + preld 0, C6, 0x00 + GMUL xvf, s, D14, U0, X7, D15, U1, X7 + preld 0, C7, 0x00 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x20 +.endm + +.macro KERNEL1x16x8 + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ + D2, U0, X1, D2, D3, U1, X1, D3 + preld 0, A0, A_PRE + GMADD xvf, s, D4, U0, X2, D4, D5, U1, X2, D5, \ + D6, U0, X3, D6, D7, U1, X3 D7 + GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C + GMADD xvf, s, D8, U0, X4, D8, D9, U1, X4, D9, \ + D10, U0, X5, D10, D11, U1, X5, D11 + //preld 0, B0, B_PRE + GMADD xvf, s, D12, U0, X6, D12, D13, U1, X6, D13, \ + D14, U0, X7, D14, D15, U1, X7 D15 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x20 +.endm + +.macro KERNEL8x16x8 +.rept 8 + KERNEL1x16x8 +.endr +.endm + +.macro SAVE16x8 +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA, \ + D8, D8, VALPHA, D9, D9, VALPHA, D10, D10, VALPHA, D11, D11, VALPHA, \ + D12, D12, VALPHA, D13, D13, VALPHA, D14, D14, VALPHA, D15, D15, VALPHA +#else + /* Load C0 */ + GLD xv, , X0, C0, 0x00, X1, C0, 0x20 + GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 + /* Load C1 */ + GLD xv, , X2, C1, 0x00, X3, C1, 0x20 + GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 + /* Load C2 */ + GLD xv, , X4, C2, 0x00, X5, C2, 0x20 + GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5 + /* Load C3 */ + GLD xv, , X6, C3, 0x00, X7, C3, 0x20 + GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7 + /* Load C4 */ + GLD xv, , X0, C4, 0x00, X1, C4, 0x20 + GMADD xvf, s, D8, D8, VALPHA, X0, D9, D9, VALPHA, X1 + /* Load C5 */ + GLD xv, , X2, C5, 0x00, X3, C5, 0x20 + GMADD xvf, s, D10, D10, VALPHA, X2, D11, D11, VALPHA, X3 + /* Load C6 */ + GLD xv, , X4, C6, 0x00, X5, C6, 0x20 + GMADD xvf, s, D12, D12, VALPHA, X4, D13, D13, VALPHA, X5 + /* Load C7 */ + GLD xv, , X6, C7, 0x00, X7, C7, 0x20 + GMADD xvf, s, D14, D14, VALPHA, X6, D15, D15, VALPHA, X7 +#endif // #if defined(TRMMKERNEL) + GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ + D2, C1, 0x00, D3, C1, 0x20, \ + D4, C2, 0x00, D5, C2, 0x20, \ + D6, C3, 0x00, D7, C3, 0x20, \ + D8, C4, 0x00, D9, C4, 0x20, \ + D10, C5, 0x00, D11, C5, 0x20, \ + D12, C6, 0x00, D13, C6, 0x20, \ + D14, C7, 0x00, D15, C7, 0x20 +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ + C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 +#elif __loongarch_grlen == 32 + GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ + C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 +#else + GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ + C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 +#endif +.endm + +// m = 8, 4, 2, 1 +// stride = 0x20, 0x10, 0x08, 0x04 +.macro KERNEL1xMx8_START m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMUL xvf, s, D0, U0, X0, D2, U0, X1, \ + D4, U0, X2, D6, U0, X3 + GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C + GMUL xvf, s, D8, U0, X4, D10, U0, X5, \ + D12, U0, X6, D14, U0, X7 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x20 +.endm + +.macro KERNEL1xMx8 m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \ + D4, U0, X2, D4, D6, U0, X3, D6 + GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C + GMADD xvf, s, D8, U0, X4, D8, D10, U0, X5, D10, \ + D12, U0, X6, D12, D14, U0, X7, D14 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x20 +.endm + +.macro KERNEL8xMx8 m, stride +.rept 8 + KERNEL1xMx8 \m, \stride +.endr +.endm + +.macro SAVEMx8 m, stride +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA, \ + D4, D4, VALPHA, D6, D6, VALPHA, \ + D8, D8, VALPHA, D10, D10, VALPHA, \ + D12, D12, VALPHA, D14, D14, VALPHA +#else + /* Load C0, C1, C2, C3, C4, C5, C6, C7 */ + .if \m == 8 + GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00 + .elseif \m == 4 + GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 + .endif + GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \ + D4, D4, VALPHA, X4, D6, D6, VALPHA, X6 +.if \m == 8 + GLD xv, , X0, C4, 0x00, X2, C5, 0x00, X4, C6, 0x00, X6, C7, 0x00 +.elseif \m == 4 + GLD v, , $vr2, C4, 0x00, $vr4, C5, 0x00, $vr6, C6, 0x00, $vr8, C7, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00 +.endif + GMADD xvf, s, D8, D8, VALPHA, X0, D10, D10, VALPHA, X2, \ + D12, D12, VALPHA, X4, D14, D14, VALPHA, X6 +#endif // #if defined(TRMMKERNEL) +.if \m == 8 + GST xv, , D0, C0, 0x00, D2, C1, 0x00, \ + D4, C2, 0x00, D6, C3, 0x00, \ + D8, C4, 0x00, D10, C5, 0x00, \ + D12, C6, 0x00, D14, C7, 0x00 +.elseif \m == 4 + GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \ + $vr14, C2, 0x00, $vr16, C3, 0x00, \ + $vr18, C4, 0x00, $vr20, C5, 0x00, \ + $vr22, C6, 0x00, $vr24, C7, 0x00 +.elseif \m == 2 + GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \ + $f14, C2, 0x00, $f16, C3, 0x00, \ + $f18, C4, 0x00, $f20, C5, 0x00, \ + $f22, C6, 0x00, $f24, C7, 0x00 +.elseif \m == 1 + GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \ + $f14, C2, 0x00, $f16, C3, 0x00, \ + $f18, C4, 0x00, $f20, C5, 0x00, \ + $f22, C6, 0x00, $f24, C7, 0x00 +.endif +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ + C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride +#elif __loongarch_grlen == 32 + GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ + C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride +#else + GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ + C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride +#endif +.endm + +.macro KERNEL1x16x4_START + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMUL xvf, s, D0, U0, X0, D1, U1, X0, \ + D2, U0, X1, D3, U1, X1, \ + D4, U0, X2, D5, U1, X2, \ + D6, U0, X3, D7, U1, X3 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x10 +.endm + +.macro KERNEL1x16x4 + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ + D2, U0, X1, D2, D3, U1, X1, D3, \ + D4, U0, X2, D4, D5, U1, X2, D5, \ + D6, U0, X3, D6, D7, U1, X3 D7 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x10 +.endm + +.macro KERNEL8x16x4 +.rept 8 + KERNEL1x16x4 +.endr +.endm + +.macro SAVE16x4 +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA +#else + /* Load C0 */ + GLD xv, , X0, C0, 0x00, X1, C0, 0x20 + GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 + /* Load C1 */ + GLD xv, , X2, C1, 0x00, X3, C1, 0x20 + GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 + /* Load C2 */ + GLD xv, , X4, C2, 0x00, X5, C2, 0x20 + GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5 + /* Load C3 */ + GLD xv, , X6, C3, 0x00, X7, C3, 0x20 + GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7 +#endif // #if defined(TRMMKERNEL) + GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ + D2, C1, 0x00, D3, C1, 0x20, \ + D4, C2, 0x00, D5, C2, 0x20, \ + D6, C3, 0x00, D7, C3, 0x20 +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 +#elif __loongarch_grlen == 32 + GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 +#else + GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 +#endif +.endm + +// m = 8, 4, 2, 1 +// stride = 0x20, 0x10, 0x08, 0x04 +.macro KERNEL1xMx4_START m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMUL xvf, s, D0, U0, X0, D2, U0, X1, \ + D4, U0, X2, D6, U0, X3 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x10 +.endm + +.macro KERNEL1xMx4 m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \ + D4, U0, X2, D4, D6, U0, X3, D6 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x10 +.endm + +.macro KERNEL8xMx4 m, stride +.rept 8 + KERNEL1xMx4 \m, \stride +.endr +.endm + +.macro SAVEMx4 m, stride +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA, \ + D4, D4, VALPHA, D6, D6, VALPHA +#else + /* Load C0, C1, C2, C3 */ + .if \m == 8 + GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00 + .elseif \m == 4 + GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 + .endif + GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \ + D4, D4, VALPHA, X4, D6, D6, VALPHA, X6 +#endif // #if defined(TRMMKERNEL) +.if \m == 8 + GST xv, , D0, C0, 0x00, D2, C1, 0x00, \ + D4, C2, 0x00, D6, C3, 0x00 +.elseif \m == 4 + GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \ + $vr14, C2, 0x00, $vr16, C3, 0x00 +.elseif \m == 2 + GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \ + $f14, C2, 0x00, $f16, C3, 0x00 +.elseif \m == 1 + GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \ + $f14, C2, 0x00, $f16, C3, 0x00 +.endif +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride +#elif __loongarch_grlen == 32 + GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride +#else + GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride +#endif +.endm + +.macro KERNEL1x16x2_START + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 + GMUL xvf, s, D0, U0, X0, D1, U1, X0, \ + D2, U0, X1, D3, U1, X1 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x08 +.endm + +.macro KERNEL1x16x2 + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 + GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ + D2, U0, X1, D2, D3, U1, X1, D3 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x08 +.endm + +.macro KERNEL8x16x2 +.rept 8 + KERNEL1x16x2 +.endr +.endm + +.macro SAVE16x2 +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#else + /* Load C0 */ + GLD xv, , X0, C0, 0x00, X1, C0, 0x20 + GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 + /* Load C1 */ + GLD xv, , X2, C1, 0x00, X3, C1, 0x20 + GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 +#endif // #if defined(TRMMKERNEL) + GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ + D2, C1, 0x00, D3, C1, 0x20 +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, 0x40, C1, C1, 0x40 +#elif __loongarch_grlen == 32 + GADDI , w, C0, C0, 0x40, C1, C1, 0x40 +#else + GADDI , d, C0, C0, 0x40, C1, C1, 0x40 +#endif +.endm + +// m = 8, 4, 2, 1 +// stride = 0x20, 0x10, 0x08, 0x04 +.macro KERNEL1xMx2_START m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 + GMUL xvf, s, D0, U0, X0, D2, U0, X1 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x08 +.endm + +.macro KERNEL1xMx2 m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 + GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x08 +.endm + +.macro KERNEL8xMx2 m, stride +.rept 8 + KERNEL1xMx2 \m, \stride +.endr +.endm + +.macro SAVEMx2 m, stride +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA +#else + /* Load C0, C1 */ + .if \m == 8 + GLD xv, , X0, C0, 0x00, X2, C1, 0x00 + .elseif \m == 4 + GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00 + .endif + GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2 +#endif // #if defined(TRMMKERNEL) +.if \m == 8 + GST xv, , D0, C0, 0x00, D2, C1, 0x00 +.elseif \m == 4 + GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00 +.elseif \m == 2 + GST f, d, $f10, C0, 0x00, $f12, C1, 0x00 +.elseif \m == 1 + GST f, s, $f10, C0, 0x00, $f12, C1, 0x00 +.endif +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, \stride, C1, C1, \stride +#elif __loongarch_grlen == 32 + GADDI , w, C0, C0, \stride, C1, C1, \stride +#else + GADDI , d, C0, C0, \stride, C1, C1, \stride +#endif +.endm + +.macro KERNEL1x16x1_START + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + GLDREPL xv, w, X0, B0, 0x00 + GMUL xvf, s, D0, U0, X0, D1, U1, X0 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x04 +.endm + +.macro KERNEL1x16x1 + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + GLDREPL xv, w, X0, B0, 0x00 + GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x04 +.endm + +.macro KERNEL8x16x1 +.rept 8 + KERNEL1x16x1 +.endr +.endm + +.macro SAVE16x1 +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA +#else + /* Load C0 */ + GLD xv, , X0, C0, 0x00, X1, C0, 0x20 + GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 +#endif // #if defined(TRMMKERNEL) + GST xv, , D0, C0, 0x00, D1, C0, 0x20 +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, 0x40 +#elif __loongarch_grlen == 32 + GADDI , w, C0, C0, 0x40 +#else + GADDI , d, C0, C0, 0x40 +#endif +.endm + +// m = 8, 4, 2, 1 +// stride = 0x20, 0x10, 0x08, 0x04 +.macro KERNEL1xMx1_START m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00 + GMUL xvf, s, D0, U0, X0 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x04 +.endm + +.macro KERNEL1xMx1 m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00 + GMADD xvf, s, D0, U0, X0, D0 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x04 +.endm + +.macro KERNEL8xMx1 m, stride +.rept 8 + KERNEL1xMx1 \m, \stride +.endr +.endm + +.macro SAVEMx1 m, stride +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA +#else + /* Load C0, C1 */ + .if \m == 8 + GLD xv, , X0, C0, 0x00 + .elseif \m == 4 + GLD v, , $vr2, C0, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C0, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C0, 0x00 + .endif + GMADD xvf, s, D0, D0, VALPHA, X0 +#endif // #if defined(TRMMKERNEL) +.if \m == 8 + GST xv, , D0, C0, 0x00 +.elseif \m == 4 + GST v, , $vr10, C0, 0x00 +.elseif \m == 2 + GST f, d, $f10, C0, 0x00 +.elseif \m == 1 + GST f, s, $f10, C0, 0x00 +.endif +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, \stride +#elif __loongarch_grlen == 32 + GADDI , w, C0, C0, \stride +#else + GADDI , d, C0, C0, \stride +#endif +.endm + + PROLOGUE + push_if_used 26, 32 + xvreplve0.w VALPHA, $xr0 +#if defined (TRMMKERNEL) && !defined(LEFT) + PTR_SUB OFF, ZERO, OFFSET +#else + xor OFF, OFF, OFF +#endif + /* if (!(N >> 3)) goto L_N7 */ + PTR_SRAI J, N, 3 /* J = bn >> 3 */ + andi N, N, 0x07 + beq ZERO, J, .L_N7 +.L_N8: /* J -- */ + move C0, C + move A0, A + PTR_SLLI T0, LDC, 2 + PTR_ADDI J, J, -1 /* J-- */ +#if __loongarch_grlen == 64 + GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ + C6, C5, T0, C7, C6, T0 +#elif __loongarch_grlen == 32 + GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ + C6, C5, T0, C7, C6, T0 +#else + GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ + C6, C5, T0, C7, C6, T0 +#endif +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + /* if (!(M >> 4)) goto L_M8 */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_M8 +.align 5 +.L_M16: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x06 + PTR_ADD A0, A0, T0 /* A0 += 16 * OFF */ + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 16 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1x16x8_START + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M16_L7 */ + beq ZERO,TL, .L_M16_L7 +.align 5 +.L_M16_TL1: + KERNEL8x16x8 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M16_TL1 +.L_M16_L7: + andi TL, L, 7 + beq TL, ZERO,.L_M16_L0 +.align 5 +.L_M16_L71: + KERNEL1x16x8 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M16_L71 +.L_M16_L0: + SAVE16x8 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -16 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x10 /* number of values in A */ +#endif +#endif // #if defined(TRMMKERNEL) + + PTR_ADDI I, I, -1 /* I-- */ + blt ZERO,I, .L_M16 +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_M0 + + andi I, M, 8 + beq ZERO,I, .L_M4 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x05 + PTR_ADD A0, A0, T0 /* A0 += 8 * OFF */ + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 8 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif // #if defined(TRMMKERNEL) + KERNEL1xMx8_START 8, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M8_L7 */ + beq ZERO,TL, .L_M8_L7 +.align 5 +.L_M8_TL1: + KERNEL8xMx8 8, 0x20 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M8_TL1 +.L_M8_L7: + /* if (!(L & 7)) goto L_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M8_L0 +.align 5 +.L_M8_L71: + KERNEL1xMx8 8, 0x20 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M8_L71 +.L_M8_L0: + SAVEMx8 8, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -8 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + PTR_ADDI OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) +.L_M4: + andi I, M, 4 + beq ZERO,I, .L_M2 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x04 + PTR_ADD A0, A0, T0 /* A0 += 4 * OFF */ + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 4 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx8_START 4, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 +.align 5 +.L_M4_TL1: + KERNEL8xMx8 4, 0x10 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M4_TL1 +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 +.L_M4_L71: + KERNEL1xMx8 4, 0x10 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M4_L71 +.L_M4_L0: + SAVEMx8 4, 0x10 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -4 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + PTR_ADDI OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 2 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx8_START 2, 0x08 + + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 +.align 5 +.L_M2_TL1: + KERNEL8xMx8 2, 0x08 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 +.align 5 +.L_M2_L71: + KERNEL1xMx8 2, 0x08 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M2_L71 +.L_M2_L0: + SAVEMx8 2, 0x08 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -2 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + PTR_ADDI OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 1 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx8_START 1, 0x04 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 +.align 5 +.L_M1_TL1: + KERNEL8xMx8 1, 0x04 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 +.L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M1_L0 +.align 5 +.L_M1_L71: + KERNEL1xMx8 1, 0x04 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M1_L71 +.L_M1_L0: + SAVEMx8 1, 0x04 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -1 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + PTR_ADDI OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +.L_M0: + /* Add stride for B and C + * B += (K * 32) + * C += (LDC * 32) + */ + PTR_SLLI T0, K, 5 + PTR_SLLI T1, LDC, 5 + PTR_ADD B, B, T0 + PTR_ADD C, C, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + PTR_ADDI OFF, OFF, 0x08 /* number of values in B */ +#endif + blt ZERO, J, .L_N8 + +.L_N7: + andi J, N, 4 + beq ZERO, J, .L_N3 +.L_N4: + move C0, C + move A0, A + PTR_SLLI T0, LDC, 2 +#if __loongarch_grlen == 64 + GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0 +#elif __loongarch_grlen == 32 + GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0 +#else + GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0 +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N4_M8 */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N4_M8 +.align 5 +.L_N4_M16: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x06 + PTR_ADD A0, A0, T0 /* A0 += 16 * OFF */ + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 /* B0 += 4 * OFF */ +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 16 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1x16x4_START + + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_L7 */ + beq ZERO,TL, .L_N4_M16_L7 +.align 5 +.L_N4_M16_TL1: /* TL-- */ + KERNEL8x16x4 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M16_TL1 +.L_N4_M16_L7: + /* if (!(L & 7)) goto L_N4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M16_L0 +.align 5 +.L_N4_M16_L71: + KERNEL1x16x4 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M16_L71 +.L_N4_M16_L0: + SAVE16x4 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -16 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + PTR_ADDI I, I, -1 /* I-- */ + blt ZERO,I, .L_N4_M16 +.L_N4_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N4_M0 + + andi I, M, 8 + beq ZERO,I, .L_N4_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 8 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx4_START 8, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_M8_L7 */ + beq ZERO,TL, .L_N4_M8_L7 +.align 5 +.L_N4_M8_TL1: /* TL-- */ + KERNEL8xMx4 8, 0x20 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M8_TL1 +.L_N4_M8_L7: + /* if (!(L & 7)) goto L_N4_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M8_L0 +.align 5 +.L_N4_M8_L71: + KERNEL1xMx4 8, 0x20 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M8_L71 +.L_N4_M8_L0: + SAVEMx4 8, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -8 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N4_M4: + andi I, M, 4 + beq ZERO,I, .L_N4_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 4 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx4_START 4, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_M4_L7 */ + beq ZERO,TL, .L_N4_M4_L7 +.align 5 +.L_N4_M4_TL1: /* TL-- */ + KERNEL8xMx4 4, 0x10 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M4_TL1 +.L_N4_M4_L7: + /* if (!(L & 7)) goto L_N4_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M4_L0 +.align 5 +.L_N4_M4_L71: + KERNEL1xMx4 4, 0x10 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M4_L71 +.L_N4_M4_L0: + SAVEMx4 4, 0x10 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -4 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N4_M2: + andi I, M, 2 + beq ZERO,I, .L_N4_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 2 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx4_START 2, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_M2_L7 */ + beq ZERO,TL, .L_N4_M2_L7 +.align 5 +.L_N4_M2_TL1: /* TL-- */ + KERNEL8xMx4 2, 0x08 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M2_TL1 +.L_N4_M2_L7: + /* if (!(L & 7)) goto L_N4_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M2_L0 +.align 5 +.L_N4_M2_L71: + KERNEL1xMx4 2, 0x08 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M2_L71 +.L_N4_M2_L0: + SAVEMx4 2, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -2 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N4_M1: + andi I, M, 1 + beq ZERO,I, .L_N4_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 1 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx4_START 1, 0x04 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_M1_L7 */ + beq ZERO,TL, .L_N4_M1_L7 +.align 5 +.L_N4_M1_TL1: /* TL-- */ + KERNEL8xMx4 1, 0x04 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M1_TL1 +.L_N4_M1_L7: + /* if (!(L & 7)) goto L_N4_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M1_L0 +.align 5 +.L_N4_M1_L71: + KERNEL1xMx4 1, 0x04 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M1_L71 +.L_N4_M1_L0: + SAVEMx4 1, 0x04 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -1 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N4_M0: + /* Add stride for B and C + * B += 4 * K + * C += 4 * LDC + */ + PTR_SLLI T0, K, 4 + PTR_SLLI T1, LDC, 4 + PTR_ADD B, B, T0 + PTR_ADD C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + PTR_ADDI OFF, OFF, 0x04 +#endif + /* We must reinit I */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + +.L_N2: + move C0, C + move A0, A + PTR_SLLI T0, LDC, 2 + PTR_ADD C1, C0, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N2_M8 */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N2_M8 +.align 5 +.L_N2_M16: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x03 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 16 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1x16x2_START + + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M16_L7 */ + beq ZERO,TL, .L_N2_M16_L7 +.align 5 +.L_N2_M16_TL1: /* TL-- */ + KERNEL8x16x2 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M16_TL1 +.L_N2_M16_L7: + /* if (!(L & 7)) goto L_N2_M16_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M16_L0 +.align 5 +.L_N2_M16_L71: + KERNEL1x16x2 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M16_L71 +.L_N2_M16_L0: + SAVE16x2 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -16 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x03 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + PTR_ADDI I, I, -1 /* I-- */ + blt ZERO,I, .L_N2_M16 +.L_N2_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N2_M0 + + andi I, M, 8 + beq ZERO,I, .L_N2_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x03 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 8 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx2_START 8, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M8_L7 */ + beq ZERO,TL, .L_N2_M8_L7 +.align 5 +.L_N2_M8_TL1: /* TL-- */ + KERNEL8xMx2 8, 0x20 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M8_TL1 +.L_N2_M8_L7: + /* if (!(L & 7)) goto L_N2_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M8_L0 +.align 5 +.L_N2_M8_L71: + KERNEL1xMx2 8, 0x20 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M8_L71 +.L_N2_M8_L0: + SAVEMx2 8, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -8 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x03 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N2_M4: + andi I, M, 4 + beq ZERO,I, .L_N2_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x03 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 4 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx2_START 4, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M4_L7 */ + beq ZERO,TL, .L_N2_M4_L7 +.align 5 +.L_N2_M4_TL1: /* TL-- */ + KERNEL8xMx2 4, 0x10 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M4_TL1 +.L_N2_M4_L7: + /* if (!(L & 7)) goto L_N2_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M4_L0 +.align 5 +.L_N2_M4_L71: + KERNEL1xMx2 4, 0x10 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M4_L71 +.L_N2_M4_L0: + SAVEMx2 4, 0x10 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -4 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x03 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N2_M2: + andi I, M, 2 + beq ZERO,I, .L_N2_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x03 + PTR_ADD A0, A0, T0 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 2 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx2_START 2, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M2_L7 */ + beq ZERO,TL, .L_N2_M2_L7 +.align 5 +.L_N2_M2_TL1: /* TL-- */ + KERNEL8xMx2 2, 0x08 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M2_TL1 +.L_N2_M2_L7: + /* if (!(L & 7)) goto L_N2_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M2_L0 +.align 5 +.L_N2_M2_L71: + KERNEL1xMx2 2, 0x08 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M2_L71 +.L_N2_M2_L0: + SAVEMx2 2, 0x08 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -2 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x03 + PTR_ADD A0, A0, T0 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N2_M1: + andi I, M, 1 + beq ZERO,I, .L_N2_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x03 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 1 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx2_START 1, 0x04 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M1_L7 */ + beq ZERO,TL, .L_N2_M1_L7 +.align 5 +.L_N2_M1_TL1: /* TL-- */ + KERNEL8xMx2 1, 0x04 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M1_TL1 +.L_N2_M1_L7: + /* if (!(L & 7)) goto L_N2_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M1_L0 +.align 5 +.L_N2_M1_L71: + KERNEL1xMx2 1, 0x04 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M1_L71 +.L_N2_M1_L0: + SAVEMx2 1, 0x04 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -1 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x03 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N2_M0: + /* Add stride for B and C + * B += 2 * K + * C += 2 * LDC + */ + PTR_SLLI T0, K, 3 + PTR_SLLI T1, LDC, 3 + PTR_ADD B, B, T0 + PTR_ADD C, C, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + PTR_ADDI OFF, OFF, 0x02 +#endif + /* We must reinit I */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + move C0, C + move A0, A + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + /* if (!(M >> 4)) goto L_N1_M8 */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N1_M8 +.L_N1_M16: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x02 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 16 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1x16x1_START + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M16_L7 */ + beq ZERO,TL, .L_N1_M16_L7 +.align 5 +.L_N1_M16_TL1: /* TL-- */ + KERNEL8x16x1 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M16_TL1 +.L_N1_M16_L7: + /* if (!(L & 7)) goto L_N1_M16_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M16_L0 +.align 5 +.L_N1_M16_L71: + KERNEL1x16x1 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M16_L71 +.L_N1_M16_L0: + SAVE16x1 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -16 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x02 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + PTR_ADDI I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_M16 +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N1_M0 + + andi I, M, 8 + beq ZERO,I, .L_N1_M4 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x02 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 8 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx1_START 8, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M8_L7 */ + beq ZERO,TL, .L_N1_M8_L7 +.align 5 +.L_N1_M8_TL1: /* TL-- */ + KERNEL8xMx1 8, 0x20 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M8_TL1 +.L_N1_M8_L7: + /* if (!(L & 7)) goto L_N1_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M8_L0 +.align 5 +.L_N1_M8_L71: + KERNEL1xMx1 8, 0x20 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M8_L71 +.L_N1_M8_L0: + SAVEMx1 8, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -8 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x02 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N1_M4: + andi I, M, 4 + beq ZERO,I, .L_N1_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x02 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 4 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx1_START 4, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M4_L7 */ + beq ZERO,TL, .L_N1_M4_L7 +.align 5 +.L_N1_M4_TL1: /* TL-- */ + KERNEL8xMx1 4, 0x10 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M4_TL1 +.L_N1_M4_L7: + /* if (!(L & 7)) goto L_N1_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M4_L0 +.align 5 +.L_N1_M4_L71: + KERNEL1xMx1 4, 0x10 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M4_L71 +.L_N1_M4_L0: + SAVEMx1 4, 0x10 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -4 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x02 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N1_M2: + andi I, M, 2 + beq ZERO,I, .L_N1_M1 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x02 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 2 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx1_START 2, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 +.align 5 +.L_N1_M2_TL1: /* TL-- */ + KERNEL8xMx1 2, 0x08 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 +.L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M2_L0 +.align 5 +.L_N1_M2_L71: + KERNEL1xMx1 2, 0x08 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M2_L71 +.L_N1_M2_L0: + SAVEMx1 2, 0x08 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -2 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x02 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +.L_N1_M1: + andi I, M, 1 + beq ZERO,I, .L_N1_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x02 + PTR_ADD A0, A0, T0 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 1 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx1_START 1, 0x04 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 +.align 5 +.L_N1_M1_TL1: /* TL-- */ + KERNEL8xMx1 1, 0x04 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 +.L_N1_M1_L7: + /* if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 +.align 5 +.L_N1_M1_L71: + KERNEL1xMx1 1, 0x04 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 +.L_N1_M1_L0: + SAVEMx1 1, 0x04 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -1 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x02 + PTR_ADD A0, A0, T0 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N1_M0: +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/sgemm_ncopy_16_lasx.S b/kernel/loongarch64/sgemm_ncopy_16_lasx.S new file mode 100644 index 000000000..266c07c5c --- /dev/null +++ b/kernel/loongarch64/sgemm_ncopy_16_lasx.S @@ -0,0 +1,463 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define S11 $r24 +#define S12 $r25 +#define S13 $r26 +#define S14 $r27 +#define S15 $r28 +#define S16 $r29 +#define TD $r30 +#define TS $r31 +#define TL $r7 +#define T0 $r6 +#undef ZERO +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 + +// Loops outline +//.L_N16 <------------------- +//| .L_M8: | +//| .L_M7: | Main Loop +//| .L_M1: | +//| .L_M0: --------------- +//.L_N15: +//.L_N8: +//| .L_N8_M8: +//| .L_N8_M7: +//| .L_N8_M1: +//.L_N7: +//.L_N4: +//| .L_N4_M4: +//| .L_N4_M3: +//| .L_N4_M1: +//.L_N3: +//.L_N2: +//| .L_N2_M2: +//| .L_N2_M1: +//.L_N1: +//| .L_N1_M1: +//.L_N0 + + PROLOGUE + push_if_used 26, 32 + + move TD, DST + move TS, SRC + PTR_SLLI TL, LDA, 0x02 + PTR_SLLI T0, TL, 0x01 + PTR_SRAI J, N, 0x04 + beq J, ZERO, .L_N15 +.align 5 +.L_N16: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x03 + PTR_ADD S3, S2, TL + PTR_ADDI J, J, -1 + PTR_ADD S4, S3, TL + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD S9, S7, T0 + PTR_ADD S10, S8, T0 + PTR_ADD S11, S9, T0 + PTR_ADD S12, S10, T0 + PTR_ADD S13, S11, T0 + PTR_ADD S14, S12, T0 + PTR_ADD S15, S13, T0 + PTR_ADD S16, S14, T0 + PTR_ADD TS, S15, T0 + beq I, ZERO, .L_M7 +.align 5 +.L_M8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + xvld U8, S9, 0x00 + xvld U9, S10, 0x00 + xvld U10, S11, 0x00 + xvld U11, S12, 0x00 + xvld U12, S13, 0x00 + xvld U13, S14, 0x00 + xvld U14, S15, 0x00 + xvld U15, S16, 0x00 + + GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ + U0, U1, U2, U3, U4, U5, U6, U7, \ + D1, D3, D5, D7 // As tmp + GTRANSPOSE8x8_W D1, D3, D5, D7, D9, D11, D13, D15, \ + U8, U9, U10, U11, U12, U13, U14, U15, \ + U0, U1, U2, U3 // As tmp + GST xv, , D0, TD, 0x00, D1, TD, 0x20, D2, TD, 0x40, D3, TD, 0x60, \ + D4, TD, 0x80, D5, TD, 0xA0, D6, TD, 0xC0, D7, TD, 0xE0 + PTR_ADDI TD, TD, 0x100 + GST xv, , D8, TD, 0x00, D9, TD, 0x20, D10, TD, 0x40, D11, TD, 0x60, \ + D12, TD, 0x80, D13, TD, 0xA0, D14, TD, 0xC0, D15, TD, 0xE0 + PTR_ADDI TD, TD, 0x100 + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + PTR_ADDI S9, S9, 0x20 + PTR_ADDI S10, S10, 0x20 + PTR_ADDI S11, S11, 0x20 + PTR_ADDI S12, S12, 0x20 + PTR_ADDI S13, S13, 0x20 + PTR_ADDI S14, S14, 0x20 + PTR_ADDI S15, S15, 0x20 + PTR_ADDI S16, S16, 0x20 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_M8 +.L_M7: + andi I, M, 0x07 + beq I, ZERO, .L_M0 +.align 5 +.L_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + fld.s F4, S5, 0x00 + fld.s F5, S6, 0x00 + fld.s F6, S7, 0x00 + fld.s F7, S8, 0x00 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0C + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1C + + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI S5, S5, 0x04 + PTR_ADDI S6, S6, 0x04 + PTR_ADDI S7, S7, 0x04 + PTR_ADDI S8, S8, 0x04 + PTR_ADDI TD, TD, 0x20 + + fld.s F0, S9, 0x00 + fld.s F1, S10, 0x00 + fld.s F2, S11, 0x00 + fld.s F3, S12, 0x00 + fld.s F4, S13, 0x00 + fld.s F5, S14, 0x00 + fld.s F6, S15, 0x00 + fld.s F7, S16, 0x00 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0C + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1C + + PTR_ADDI S9, S9, 0x04 + PTR_ADDI S10, S10, 0x04 + PTR_ADDI S11, S11, 0x04 + PTR_ADDI S12, S12, 0x04 + PTR_ADDI S13, S13, 0x04 + PTR_ADDI S14, S14, 0x04 + PTR_ADDI S15, S15, 0x04 + PTR_ADDI S16, S16, 0x04 + PTR_ADDI TD, TD, 0x20 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_M1 +.L_M0: + blt ZERO, J, .L_N16 +.L_N15: + andi J, N, 0x0f + beq ZERO, J, .L_N0 + + andi J, N, 0x08 + beq ZERO, J, .L_N7 +.L_N8: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x03 + PTR_ADD S3, S2, TL + PTR_ADD S4, S2, T0 + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD TS, S7, T0 + beq I, ZERO, .L_N8_M7 +.align 5 +.L_N8_M8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ + U0, U1, U2, U3, U4, U5, U6, U7, \ + D1, D3, D5, D7 // As tmp + GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ + D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 + PTR_ADDI TD, TD, 0x100 + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N8_M8 +.L_N8_M7: + andi I, M, 0x07 + beq I, ZERO, .L_N7 +.align 5 +.L_N8_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + fld.s F4, S5, 0x00 + fld.s F5, S6, 0x00 + fld.s F6, S7, 0x00 + fld.s F7, S8, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + fst.s F2, TD, 0x08 + PTR_ADDI S3, S3, 0x04 + fst.s F3, TD, 0x0C + PTR_ADDI S4, S4, 0x04 + fst.s F4, TD, 0x10 + PTR_ADDI S5, S5, 0x04 + fst.s F5, TD, 0x14 + PTR_ADDI S6, S6, 0x04 + fst.s F6, TD, 0x18 + PTR_ADDI S7, S7, 0x04 + fst.s F7, TD, 0x1C + PTR_ADDI S8, S8, 0x04 + + PTR_ADDI TD, TD, 0x20 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N8_M1 +.L_N7: + andi J, N, 0x07 + beq ZERO, J, .L_N0 + + andi J, N, 0x04 + beq ZERO, J, .L_N3 +.L_N4: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x02 + PTR_ADD S3, S2, TL + PTR_ADD S4, S2, T0 + PTR_ADD TS, S3, T0 + beq I, ZERO, .L_N4_M3 +.align 5 +.L_N4_M4: + GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 + GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 + GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 + GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 + GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 + GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI TD, TD, 0x40 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N4_M4 +.L_N4_M3: + andi I, M, 0x03 + beq I, ZERO, .L_N3 +.align 5 +.L_N4_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + fst.s F2, TD, 0x08 + PTR_ADDI S3, S3, 0x04 + fst.s F3, TD, 0x0C + PTR_ADDI S4, S4, 0x04 + + PTR_ADDI TD, TD, 0x10 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N4_M1 +.L_N3: + andi J, N, 0x03 + beq ZERO, J, .L_N0 + + andi J, N, 0x02 + beq ZERO, J, .L_N1 +.L_N2: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x01 + PTR_ADD TS, S2, TL + beq I, ZERO, .L_N2_M1 +.align 5 +.L_N2_M2: + GLD f, d, F0, S1, 0x00, F1, S2, 0x00 + vilvl.w $vr0, $vr1, $vr0 + GST v, , $vr0, TD, 0x00 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI TD, TD, 0x10 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N2_M2 +.L_N2_M1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI TD, TD, 0x08 +.align 5 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_N1_M1: + fld.s F0, S1, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F0, TD, 0x00 + PTR_ADDI TD, TD, 0x04 + PTR_ADDI M, M, -1 + blt ZERO, M, .L_N1_M1 +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/sgemm_ncopy_8_lasx.S b/kernel/loongarch64/sgemm_ncopy_8_lasx.S new file mode 100644 index 000000000..5c173568b --- /dev/null +++ b/kernel/loongarch64/sgemm_ncopy_8_lasx.S @@ -0,0 +1,298 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r6 +#undef ZERO +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 +#define D8 $xr16 +#define D10 $xr17 +#define D12 $xr18 +#define D14 $xr19 + +// Loops outline +//.L_N8: <---------------- +//| .L_M8: | +//| .L_M7: | Main Loop +//| .L_M1: | +//| .L_M0:-------------- +//.L_N7: +//.L_N4: +//| .L_N4_M4: +//| .L_N4_M3: +//| .L_N4_M1: +//.L_N3: +//.L_N2: +//| .L_N2_M2: +//| .L_N2_M1: +//.L_N1: +//| .L_N1_M1: +//.L_N0 + + PROLOGUE + push_if_used 17, 20 + + move TD, DST + move TS, SRC + PTR_SLLI TL, LDA, 0x02 + PTR_SLLI T0, TL, 0x01 + PTR_SRAI J, N, 0x03 + beq J, ZERO, .L_N7 +.align 5 +.L_N8: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x03 + PTR_ADD S3, S2, TL + PTR_ADDI J, J, -1 + PTR_ADD S4, S2, T0 + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD TS, S7, T0 + beq I, ZERO, .L_M7 +.align 5 +.L_M8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ + U0, U1, U2, U3, U4, U5, U6, U7, \ + D1, D3, D5, D7 // As tmp + GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ + D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 + PTR_ADDI TD, TD, 0x100 + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_M8 +.L_M7: + andi I, M, 0x07 + beq I, ZERO, .L_M0 +.align 5 +.L_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + fld.s F4, S5, 0x00 + fld.s F5, S6, 0x00 + fld.s F6, S7, 0x00 + fld.s F7, S8, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + fst.s F2, TD, 0x08 + PTR_ADDI S3, S3, 0x04 + fst.s F3, TD, 0x0C + PTR_ADDI S4, S4, 0x04 + fst.s F4, TD, 0x10 + PTR_ADDI S5, S5, 0x04 + fst.s F5, TD, 0x14 + PTR_ADDI S6, S6, 0x04 + fst.s F6, TD, 0x18 + PTR_ADDI S7, S7, 0x04 + fst.s F7, TD, 0x1C + PTR_ADDI S8, S8, 0x04 + + PTR_ADDI TD, TD, 0x20 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_M1 +.L_M0: + blt ZERO, J, .L_N8 +.L_N7: + andi J, N, 0x07 + beq ZERO, J, .L_N0 + + andi J, N, 0x04 + beq ZERO, J, .L_N3 +.L_N4: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x02 + PTR_ADD S3, S2, TL + PTR_ADD S4, S2, T0 + PTR_ADD TS, S3, T0 + beq I, ZERO, .L_N4_M3 +.align 5 +.L_N4_M4: + GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 + GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 + GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 + GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 + GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 + GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI TD, TD, 0x40 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N4_M4 +.L_N4_M3: + andi I, M, 0x03 + beq I, ZERO, .L_N3 +.align 5 +.L_N4_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + fst.s F2, TD, 0x08 + PTR_ADDI S3, S3, 0x04 + fst.s F3, TD, 0x0C + PTR_ADDI S4, S4, 0x04 + + PTR_ADDI TD, TD, 0x10 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N4_M1 +.L_N3: + andi J, N, 0x03 + beq ZERO, J, .L_N0 + + andi J, N, 0x02 + beq ZERO, J, .L_N1 +.L_N2: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x01 + PTR_ADD TS, S2, TL + beq I, ZERO, .L_N2_M1 +.align 5 +.L_N2_M2: + GLD f, d, F0, S1, 0x00, F1, S2, 0x00 + vilvl.w $vr0, $vr1, $vr0 + GST v, , $vr0, TD, 0x00 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI TD, TD, 0x10 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N2_M2 +.L_N2_M1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI TD, TD, 0x08 +.align 5 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_N1_M1: + fld.s F0, S1, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F0, TD, 0x00 + PTR_ADDI TD, TD, 0x04 + PTR_ADDI M, M, -1 + blt ZERO, M, .L_N1_M1 +.L_N0: + pop_if_used 17, 20 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/sgemm_tcopy_16_lasx.S b/kernel/loongarch64/sgemm_tcopy_16_lasx.S new file mode 100644 index 000000000..d9789bdcd --- /dev/null +++ b/kernel/loongarch64/sgemm_tcopy_16_lasx.S @@ -0,0 +1,526 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 + +// Loops outline +//.L_M8 <------------------- +//| .L_N16: | +//| .L_N15: | +//| .L_N8: | +//| .L_N7: | Main Loop +//| .L_N4: | +//| .L_N3: | +//| .L_N2: | +//| .L_N1: | +//| .L_N0: --------------- +//.L_M7 +//.L_M4 +//| .L_M4_N16: +//| .L_M4_N15: +//| .L_M4_N8: +//| .L_M4_N7: +//| .L_M4_N4: +//| .L_M4_N3: +//| .L_M4_N2: +//| .L_M4_N1: +//.L_M3 +//.L_M2 +//| .L_M2_N16: +//| .L_M2_N15: +//| .L_M2_N8: +//| .L_M2_N7: +//| .L_M2_N4: +//| .L_M2_N3: +//| .L_M2_N2: +//| .L_M2_N1: +//.L_M1 +//| .L_M1_N16: +//| .L_M1_N15: +//| .L_M1_N8: +//| .L_M1_N7: +//| .L_M1_N4: +//| .L_M1_N3: +//| .L_M1_N2: +//| .L_M1_N1: +//.L_M0 + + PROLOGUE + push_if_used 24, 8 + + move S0, SRC + move P0, DST + + PTR_SRAI T0, N, 0x04 + PTR_SRAI T1, N, 0x03 + PTR_SLLI T0, T0, 0x04 + PTR_SLLI T1, T1, 0x03 + + PTR_MUL P2, M, T0 + PTR_MUL P3, M, T1 + PTR_SLLI P2, P2, 0x02 + PTR_SLLI P3, P3, 0x02 + PTR_ADD P2, DST, P2 + PTR_ADD P3, DST, P3 + + PTR_SRAI T0, N, 0x02 + PTR_SRAI T1, N, 0x01 + PTR_SLLI T0, T0, 0x02 + PTR_SLLI T1, T1, 0x01 + PTR_MUL P4, M, T0 + PTR_MUL P5, M, T1 + PTR_SLLI P4, P4, 0x02 + PTR_SLLI P5, P5, 0x02 + PTR_ADD P4, DST, P4 + PTR_ADD P5, DST, P5 + + PTR_SLLI TL, LDA, 0x02 + PTR_SRAI J, M, 0x03 + PTR_SLLI T0, TL, 0x01 + PTR_SLLI T1, M, 0x06 + beq ZERO, J, .L_M7 +.align 5 +.L_M8: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S3, S1, T0 + PTR_ADD S4, S2, T0 + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD S0, S7, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x200 + + PTR_SRAI I, N, 0x04 + PTR_ADDI J, J, -1 + beq ZERO, I, .L_N15 +.L_N16: + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S6, 0x00 + xvld U3, S6, 0x20 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + + xvld U4, S7, 0x00 + xvld U5, S7, 0x20 + xvld U6, S8, 0x00 + xvld U7, S8, 0x20 + + xvst U4, P1, 0x180 + xvst U5, P1, 0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + PTR_ADDI S1, S1, 0x40 + PTR_ADDI S2, S2, 0x40 + PTR_ADDI S3, S3, 0x40 + PTR_ADDI S4, S4, 0x40 + PTR_ADDI S5, S5, 0x40 + PTR_ADDI S6, S6, 0x40 + PTR_ADDI S7, S7, 0x40 + PTR_ADDI S8, S8, 0x40 + + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_N16 +.L_N15: + andi I, N, 0x08 + beq ZERO, I, .L_N7 +.L_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60, \ + U4, P2, 0x80, U5, P2, 0xA0, U6, P2, 0xC0, U7, P2, 0xE0 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + PTR_ADDI P2, P2, 0x100 +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 +.L_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ + $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 + GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30, \ + $vr4, P3, 0x40, $vr5, P3, 0x50, $vr6, P3, 0x60, $vr7, P3, 0x70 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI S5, S5, 0x10 + PTR_ADDI S6, S6, 0x10 + PTR_ADDI S7, S7, 0x10 + PTR_ADDI S8, S8, 0x10 + PTR_ADDI P3, P3, 0x80 +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 +.L_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ + $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 + GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18, \ + $f4, P4, 0x20, $f5, P4, 0x28, $f6, P4, 0x30, $f7, P4, 0x38 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI S3, S3, 0x08 + PTR_ADDI S4, S4, 0x08 + PTR_ADDI S5, S5, 0x08 + PTR_ADDI S6, S6, 0x08 + PTR_ADDI S7, S7, 0x08 + PTR_ADDI S8, S8, 0x08 + PTR_ADDI P4, P4, 0x40 +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ + $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 + GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C, \ + $f4, P5, 0x10, $f5, P5, 0x14, $f6, P5, 0x18, $f7, P5, 0x1C + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI S5, S5, 0x04 + PTR_ADDI S6, S6, 0x04 + PTR_ADDI S7, S7, 0x04 + PTR_ADDI S8, S8, 0x04 + PTR_ADDI P5, P5, 0x20 +.L_N0: + blt ZERO, J, .L_M8 +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 +.L_M4: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S3, S1, T0 + PTR_ADD S4, S2, T0 + PTR_ADD S0, S3, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x100 + + PTR_SRAI I, N, 0x04 + beq ZERO, I, .L_M4_N15 +.align 5 +.L_M4_N16: + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + PTR_ADDI S1, S1, 0x40 + PTR_ADDI S2, S2, 0x40 + PTR_ADDI S3, S3, 0x40 + PTR_ADDI S4, S4, 0x40 + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M4_N16 +.L_M4_N15: + andi I, N, 0x08 + beq ZERO, I, .L_M4_N7 +.L_M4_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI P2, P2, 0x80 +.L_M4_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M4_N3 +.L_M4_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 + GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI P3, P3, 0x40 +.L_M4_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M4_N1 +.L_M4_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 + GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI S3, S3, 0x08 + PTR_ADDI S4, S4, 0x08 + PTR_ADDI P4, P4, 0x20 +.L_M4_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 + GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI P5, P5, 0x10 +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 +.L_M2: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S0, S0, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x80 + + PTR_SRAI I, N, 0x04 + beq ZERO, I, .L_M2_N15 +.align 5 +.L_M2_N16: + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + PTR_ADDI S1, S1, 0x40 + PTR_ADDI S2, S2, 0x40 + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M2_N16 +.L_M2_N15: + andi I, N, 0x08 + beq ZERO, I, .L_M2_N7 +.L_M2_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + GST xv, , U0, P2, 0x00, U1, P2, 0x20 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI P2, P2, 0x40 +.L_M2_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M2_N3 +.L_M2_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 + GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI P3, P3, 0x20 +.L_M2_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M2_N1 +.L_M2_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 + GST f, d, $f0, P4, 0x00, $f1, P4, 0x08 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI P4, P4, 0x10 +.L_M2_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00 + GST f, s, $f0, P5, 0x00, $f1, P5, 0x04 + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI P5, P5, 0x08 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + PTR_ADD S2, S0, TL + + move P1, P0 + PTR_ADDI P0, P0, 0x40 + + PTR_SRAI I, N, 0x04 + beq ZERO, I, .L_M1_N15 +.align 5 +.L_M1_N16: + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + + PTR_ADDI S1, S1, 0x40 + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M1_N16 +.L_M1_N15: + andi I, N, 0x08 + beq ZERO, I, .L_M1_N7 +.L_M1_N8: + xvld U0, S1, 0x00 + + GST xv, , U0, P2, 0x00 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI P2, P2, 0x20 +.L_M1_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M1_N3 +.L_M1_N4: + GLD v, , $vr0, S1, 0x00 + GST v, , $vr0, P3, 0x00 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI P3, P3, 0x10 +.L_M1_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M1_N1 +.L_M1_N2: + GLD f, d, $f0, S1, 0x00 + GST f, d, $f0, P4, 0x00 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI P4, P4, 0x08 +.L_M1_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + GLD f, s, $f0, S1, 0x00 + GST f, s, $f0, P5, 0x00 + PTR_ADDI S1, S1, 0x04 + PTR_ADDI P5, P5, 0x04 +.L_M0: + pop_if_used 24, 8 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/sgemm_tcopy_8_lasx.S b/kernel/loongarch64/sgemm_tcopy_8_lasx.S new file mode 100644 index 000000000..725a47a60 --- /dev/null +++ b/kernel/loongarch64/sgemm_tcopy_8_lasx.S @@ -0,0 +1,406 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define T0 $r27 +#define T1 $r28 +#define TL $r7 +#undef ZERO +#define ZERO $r0 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 + +// Loops outline +//.L_M8 <------------------- +//| .L_N8: | +//| .L_N7: | Main Loop +//| .L_N4: | +//| .L_N3: | +//| .L_N2: | +//| .L_N1: | +//| .L_N0: --------------- +//.L_M7 +//.L_M4 +//| .L_M4_N8: +//| .L_M4_N7: +//| .L_M4_N4: +//| .L_M4_N3: +//| .L_M4_N2: +//| .L_M4_N1: +//.L_M3 +//.L_M2 +//| .L_M2_N8: +//| .L_M2_N7: +//| .L_M2_N4: +//| .L_M2_N3: +//| .L_M2_N2: +//| .L_M2_N1: +//.L_M1 +//| .L_M1_N8: +//| .L_M1_N7: +//| .L_M1_N4: +//| .L_M1_N3: +//| .L_M1_N2: +//| .L_M1_N1: +//.L_M0 + + PROLOGUE + push_if_used 23, 8 + + move S0, SRC + move P0, DST + + PTR_SRAI T0, N, 0x04 + PTR_SRAI T1, N, 0x03 + PTR_SLLI T0, T0, 0x04 + PTR_SLLI T1, T1, 0x03 + + PTR_MUL P2, M, T1 + PTR_SLLI P2, P2, 0x02 + PTR_ADD P2, DST, P2 + PTR_SRAI T0, N, 0x02 + PTR_SRAI T1, N, 0x01 + PTR_SLLI T0, T0, 0x02 + PTR_SLLI T1, T1, 0x01 + PTR_MUL P3, M, T0 + PTR_MUL P4, M, T1 + PTR_SLLI P3, P3, 0x02 + PTR_SLLI P4, P4, 0x02 + PTR_ADD P3, DST, P3 + PTR_ADD P4, DST, P4 + + PTR_SLLI TL, LDA, 0x02 + PTR_SRAI J, M, 0x03 + PTR_SLLI T0, TL, 0x01 + PTR_SLLI T1, M, 0x05 + beq ZERO, J, .L_M7 +.align 5 +.L_M8: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S3, S1, T0 + PTR_ADD S4, S2, T0 + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD S0, S7, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x100 + + PTR_SRAI I, N, 0x03 + PTR_ADDI J, J, -1 + beq ZERO, I, .L_N7 +.L_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60, \ + U4, P1, 0x80, U5, P1, 0xA0, U6, P1, 0xC0, U7, P1, 0xE0 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_N8 +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 +.L_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ + $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 + GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30, \ + $vr4, P2, 0x40, $vr5, P2, 0x50, $vr6, P2, 0x60, $vr7, P2, 0x70 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI S5, S5, 0x10 + PTR_ADDI S6, S6, 0x10 + PTR_ADDI S7, S7, 0x10 + PTR_ADDI S8, S8, 0x10 + PTR_ADDI P2, P2, 0x80 +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 +.L_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ + $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 + GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18, \ + $f4, P3, 0x20, $f5, P3, 0x28, $f6, P3, 0x30, $f7, P3, 0x38 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI S3, S3, 0x08 + PTR_ADDI S4, S4, 0x08 + PTR_ADDI S5, S5, 0x08 + PTR_ADDI S6, S6, 0x08 + PTR_ADDI S7, S7, 0x08 + PTR_ADDI S8, S8, 0x08 + PTR_ADDI P3, P3, 0x40 +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ + $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 + GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C, \ + $f4, P4, 0x10, $f5, P4, 0x14, $f6, P4, 0x18, $f7, P4, 0x1C + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI S5, S5, 0x04 + PTR_ADDI S6, S6, 0x04 + PTR_ADDI S7, S7, 0x04 + PTR_ADDI S8, S8, 0x04 + PTR_ADDI P4, P4, 0x20 +.L_N0: + blt ZERO, J, .L_M8 + +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 +.L_M4: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S3, S1, T0 + PTR_ADD S4, S2, T0 + PTR_ADD S0, S3, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x80 + + PTR_SRAI I, N, 0x03 + beq ZERO, I, .L_M4_N7 +.align 5 +.L_M4_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M4_N8 +.L_M4_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M4_N3 +.L_M4_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 + GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI P2, P2, 0x40 +.L_M4_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M4_N1 +.L_M4_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 + GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI S3, S3, 0x08 + PTR_ADDI S4, S4, 0x08 + PTR_ADDI P3, P3, 0x20 +.L_M4_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 + GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI P4, P4, 0x10 +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 +.L_M2: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S0, S0, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x40 + + PTR_SRAI I, N, 0x03 + beq ZERO, I, .L_M2_N7 +.align 5 +.L_M2_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + GST xv, , U0, P1, 0x00, U1, P1, 0x20 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M2_N8 +.L_M2_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M2_N3 +.L_M2_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 + GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI P2, P2, 0x20 +.L_M2_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M2_N1 +.L_M2_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 + GST f, d, $f0, P3, 0x00, $f1, P3, 0x08 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI P3, P3, 0x10 +.L_M2_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00 + GST f, s, $f0, P4, 0x00, $f1, P4, 0x04 + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI P4, P4, 0x08 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + PTR_ADD S2, S0, TL + + move P1, P0 + PTR_ADDI P0, P0, 0x20 + + PTR_SRAI I, N, 0x03 + beq ZERO, I, .L_M1_N7 +.align 5 +.L_M1_N8: + xvld U0, S1, 0x00 + + GST xv, , U0, P1, 0x00 + + PTR_ADDI S1, S1, 0x20 + + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M1_N8 +.L_M1_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M1_N3 +.L_M1_N4: + GLD v, , $vr0, S1, 0x00 + GST v, , $vr0, P2, 0x00 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI P2, P2, 0x10 +.L_M1_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M1_N1 +.L_M1_N2: + GLD f, d, $f0, S1, 0x00 + GST f, d, $f0, P3, 0x00 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI P3, P3, 0x08 +.L_M1_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + GLD f, s, $f0, S1, 0x00 + GST f, s, $f0, P4, 0x00 + PTR_ADDI S1, S1, 0x04 + PTR_ADDI P4, P4, 0x04 +.L_M0: + pop_if_used 23, 8 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/sgemv_n_8_lasx.S b/kernel/loongarch64/sgemv_n_8_lasx.S new file mode 100644 index 000000000..52ffc320e --- /dev/null +++ b/kernel/loongarch64/sgemv_n_8_lasx.S @@ -0,0 +1,463 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/30 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA $f0 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define Y_ORG $r15 +#define OFFSET $r16 +#define K_LDA $r17 +#define M4 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 + +#define VALPHA $xr1 +#define X0 $xr2 +#define X1 $xr3 +#define X2 $xr4 +#define X3 $xr5 +#define X4 $xr6 +#define X5 $xr7 +#define X6 $xr8 +#define X7 $xr9 +#define Y0 $xr10 +#define A0 $xr11 +#define A1 $xr12 +#define A2 $xr13 +#define A3 $xr14 +#define A4 $xr15 +#define A5 $xr16 +#define A6 $xr17 +#define A7 $xr18 + +#define X0_F $f2 +#define X1_F $f3 +#define X2_F $f4 +#define X3_F $f5 +#define X4_F $f6 +#define X5_F $f7 +#define X6_F $f8 +#define X7_F $f9 +#define Y0_F $f10 +#define A0_F $f11 +#define A1_F $f12 +#define A2_F $f13 +#define A3_F $f14 +#define A4_F $f15 +#define A5_F $f16 +#define A6_F $f17 +#define A7_F $f18 + +.macro SLOAD_X_8 + GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04, X2, X, 0x08, X3, X, 0x0C, \ + X4, X, 0x10, X5, X, 0x14, X6, X, 0x18, X7, X, 0x1C + GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ + X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA +.endm + +.macro SLOAD_X_8_GAP + xvldrepl.w X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.w X1, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.w X2, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.w X3, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.w X4, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.w X5, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.w X6, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.w X7, T0, 0x00 + GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ + X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA +.endm + +.macro SLOAD_X_4 + GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04, X2, X, 0x08, X3, X, 0x0C + GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA +.endm + +.macro SLOAD_X_4_GAP + xvldrepl.w X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.w X1, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.w X2, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.w X3, T0, 0x00 + GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA +.endm + +.macro SLOAD_X_2 + GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04 + GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA +.endm + +.macro SLOAD_X_2_GAP + xvldrepl.w X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.w X1, T0, 0x00 + GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA +.endm + +.macro SLOAD_X_1 + GLDREPL xv, w, X0, X, 0x00 + GMUL xvf, s, X0, X0, VALPHA +.endm + +.macro SLOAD_Y_8 + GLD xv, , Y0, Y, 0 +.endm + +.macro SLOAD_Y_8_GAP + fld.s Y0_F, Y, 0 + fldx.s A0_F, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + fld.s A1_F, T0, 0 + fldx.s A2_F, T0, INC_Y + PTR_ALSL T0, INC_Y, Y, 2 + fld.s A3_F, T0, 0 + fldx.s A4_F, T0, INC_Y + PTR_ADD T0, T0, INC_Y + PTR_ADD T0, T0, INC_Y + fld.s A5_F, T0, 0 + fldx.s A6_F, T0, INC_Y + GINSVE0 xv, w, Y0, A0, 1, Y0, A1, 2, Y0, A2, 3, Y0, A3, 4, \ + Y0, A4, 5, Y0, A5, 6, Y0, A6, 7 +.endm + +.macro SLOAD_Y_1 + GLD f, s, Y0_F, Y, 0 +.endm + +.macro SGEMV_N_8x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA1, 0, \ + A2, PA2, 0, A3, PA3, 0, \ + A4, PA4, 0, A5, PA5, 0, \ + A6, PA6, 0, A7, PA7, 0 + GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0, \ + Y0, A2, X2, Y0, Y0, A3, X3, Y0, \ + Y0, A4, X4, Y0, Y0, A5, X5, Y0, \ + Y0, A6, X6, Y0, Y0, A7, X7, Y0 +.endm + +.macro SGEMV_N_1x8 + GLD_INC f, s, 0x04, \ + A0_F, PA0, 0, A1_F, PA1, 0, \ + A2_F, PA2, 0, A3_F, PA3, 0, \ + A4_F, PA4, 0, A5_F, PA5, 0, \ + A6_F, PA6, 0, A7_F, PA7, 0 + GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F, \ + Y0_F, A2_F, X2_F, Y0_F, Y0_F, A3_F, X3_F, Y0_F, \ + Y0_F, A4_F, X4_F, Y0_F, Y0_F, A5_F, X5_F, Y0_F, \ + Y0_F, A6_F, X6_F, Y0_F, Y0_F, A7_F, X7_F, Y0_F +.endm + +.macro SGEMV_N_8x4 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA1, 0, \ + A2, PA2, 0, A3, PA3, 0 + GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0, \ + Y0, A2, X2, Y0, Y0, A3, X3, Y0 +.endm + +.macro SGEMV_N_1x4 + GLD_INC f, s, 0x04, \ + A0_F, PA0, 0, A1_F, PA1, 0, \ + A2_F, PA2, 0, A3_F, PA3, 0 + GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F, \ + Y0_F, A2_F, X2_F, Y0_F, Y0_F, A3_F, X3_F, Y0_F +.endm + +.macro SGEMV_N_8x2 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA1, 0 + GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0 +.endm + +.macro SGEMV_N_1x2 + GLD_INC f, s, 0x04, \ + A0_F, PA0, 0, A1_F, PA1, 0 + GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F +.endm + +.macro SGEMV_N_1x1 + GLD_INC f, s, 0x04, A0_F, PA0, 0 + GMADD f, s, Y0_F, A0_F, X0_F, Y0_F +.endm + +.macro SSTORE_Y_8 + GST xv, , Y0, Y, 0 +.endm + +.macro SSTORE_Y_8_GAP + xvstelm.w Y0, Y, 0, 0 + PTR_ADD T0, Y, INC_Y + xvstelm.w Y0, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.w Y0, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.w Y0, T0, 0, 3 + + PTR_ADD T0, T0, INC_Y + xvstelm.w Y0, T0, 0, 4 + PTR_ADD T0, T0, INC_Y + xvstelm.w Y0, T0, 0, 5 + PTR_ADD T0, T0, INC_Y + xvstelm.w Y0, T0, 0, 6 + PTR_ADD T0, T0, INC_Y + xvstelm.w Y0, T0, 0, 7 +.endm + +.macro SSTORE_Y_1 + GST f, s, Y0_F, Y, 0 +.endm + +.macro SGEMV_N_LASX XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req + PTR_SRLI J, N, 3 + beqz J, .L_\XW\()_N_7 + PTR_SLLI K_LDA, LDA, 3 + PTR_SUB K_LDA, K_LDA, M4 +.L_\XW\()_N_L8: + SLOAD_\X_8 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_M_7 +.align 5 +.L_\XW\()_M_L8: + SLOAD_\Y_8 + SGEMV_N_8x8 + SSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ALSL Y, INC_Y, Y, 3 + PTR_ADDI K, K, 8 + bnez I, .L_\XW\()_M_L8 +.L_\XW\()_M_7: + andi I, M, 7 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + SLOAD_\Y_1 + SGEMV_N_1x8 + SSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#endif + PTR_ALSL X, INC_X, X, 3 + bnez J, .L_\XW\()_N_L8 +.L_\XW\()_N_7: + andi J, N, 4 + beqz J, .L_\XW\()_N_3 + SLOAD_\X_4 + xor K, K, K + move Y, Y_ORG + + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_4_M_7 +.align 5 +.L_\XW\()_N_4_M_L8: + SLOAD_\Y_8 + SGEMV_N_8x4 + SSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ADDI K, K, 8 + PTR_ALSL Y, INC_Y, Y, 3 + bnez I, .L_\XW\()_N_4_M_L8 +.L_\XW\()_N_4_M_7: + andi I, M, 7 + beqz I, .L_\XW\()_N_4_M_END +.align 5 +.L_\XW\()_N_4_M_L1: + SLOAD_\Y_1 + SGEMV_N_1x4 + SSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_4_M_L1 +.L_\XW\()_N_4_M_END: + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M4 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_N_3: + andi J, N, 2 + beqz J, .L_\XW\()_N_1 + SLOAD_\X_2 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_2_M_7 +.align 5 +.L_\XW\()_N_2_M_L8: + SLOAD_\Y_8 + SGEMV_N_8x2 + SSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ADDI K, K, 8 + PTR_ALSL Y, INC_Y, Y, 3 + bnez I, .L_\XW\()_N_2_M_L8 +.L_\XW\()_N_2_M_7: + andi I, M, 7 + beqz I, .L_\XW\()_N_2_M_END +.align 5 +.L_\XW\()_N_2_M_L1: + SLOAD_\Y_1 + SGEMV_N_1x2 + SSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_2_M_L1 +.L_\XW\()_N_2_M_END: + PTR_SLLI K_LDA, LDA, 1 + PTR_SUB K_LDA, K_LDA, M4 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD PA1, PA1, K_LDA + PTR_ALSL X, INC_X, X, 1 +.L_\XW\()_N_1: + andi J, N, 1 + beqz J, .L_END + SLOAD_\X_1 + xor K, K, K + move Y, Y_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + SLOAD_\Y_1 + SGEMV_N_1x1 + SSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_1_M_L1 + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 7, 19 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + PTR_SUB J, INC_Y, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ + PTR_ALSL I, I, J, 1 + GSLLI , d, LDA, LDA, 2, INC_X, INC_X, 2, INC_Y, INC_Y, 2, M4, M, 2 + xvreplve0.w VALPHA, $xr0 + move Y_ORG, Y + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0_0 - .L_GAP_TABLE + .hword .L_GAP_0_1 - .L_GAP_TABLE + .hword .L_GAP_1_0 - .L_GAP_TABLE + .hword .L_GAP_1_1 - .L_GAP_TABLE +.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ + SGEMV_N_LASX GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1 +.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ + SGEMV_N_LASX GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1 +.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ + SGEMV_N_LASX GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1 +.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ + SGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 +.L_END: + pop_if_used 17 + 7, 19 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/sgemv_n_lsx.S b/kernel/loongarch64/sgemv_n_lsx.S new file mode 100644 index 000000000..3b1b519fc --- /dev/null +++ b/kernel/loongarch64/sgemv_n_lsx.S @@ -0,0 +1,227 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Param */ +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r16 +#define ALPHA $f0 + +#define YORIG $r18 +#define T0 $r19 +#define T1 $r20 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define IX $r25 +#define IY $r26 +#define II $r27 +#define T2 $r28 +#define T3 $r29 +#define T4 $r30 + +/* LSX vectors */ +#define U0 $vr11 +#define U1 $vr12 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define VALPHA $vr10 + +#define a1 $f3 +#define a2 $f4 +#define a3 $f5 +#define a4 $f6 +#define a5 $f7 +#define a6 $f8 +#define a7 $f9 +#define a8 $f10 + + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 + + addi.d $sp, $sp, -80 + + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 32 + SDARG $r27, $sp, 40 + SDARG $r28, $sp, 48 + SDARG $r29, $sp, 56 + SDARG $r30, $sp, 64 + ST ALPHA, $sp, 72 + + vldrepl.w VALPHA, $sp, 72 + + slli.d LDA, LDA, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + + bge $r0, M, .L999 + bge $r0, N, .L999 + + move J, $r0 + move IX, $r0 + + move AO1, A //a_ptr + move XX, X + move YY, Y + + beq J, M, .L999 + +.L01: + vldx U0, XX, IX + vpermi.w U0, U0, 0x00 + + vfmul.s U1, VALPHA, U0 //temp1 + + move IY, $r0 + move II, $r0 + move I, $r0 + + srai.d T0, M, 2 //n/4 + beq I, T0, .L03 + +.L02: + vldx U2, AO1, II + + move T1, IY + add.d T2, T1, INCY + add.d T3, T2, INCY + add.d T4, T3, INCY + + fldx.s a1, YY, T1 + fldx.s a2, YY, T2 + fldx.s a3, YY, T3 + fldx.s a4, YY, T4 + + vextrins.w U3, U4, 0x10 + vextrins.w U3, U5, 0x20 + vextrins.w U3, U6, 0x30 + + vfmadd.s U3, U1, U2, U3 + + vextrins.w U4, U3, 0x01 + vextrins.w U5, U3, 0x02 + vextrins.w U6, U3, 0x03 + + fstx.s a1, YY, T1 + fstx.s a2, YY, T2 + fstx.s a3, YY, T3 + fstx.s a4, YY, T4 + + add.d IY, T4, INCY + addi.d II, II, 16 + addi.d I, I, 1 + blt I, T0, .L02 + +.L03: + andi T0, M, 2 + beq $r0, T0, .L04 + + addi.d T1, $r0, 4 + mod.d T1, M, T1 + sub.d II, M, T1 + slli.d II, II, BASE_SHIFT + + move T1, IY + add.d T2, T1, INCY + + fldx.s a1, AO1, II + addi.d T0, II, 4 + fldx.s a2, AO1, T0 + + fldx.s a3, YY, T1 + fldx.s a4, YY, T2 + + fmadd.s a3, $f12, a1, a3 + fmadd.s a4, $f12, a2, a4 + + fstx.s a3, YY, T1 + fstx.s a4, YY, T2 + + add.d IY, T2, INCY + +.L04: + andi T0, M, 1 + beq $r0, T0, .L05 + + addi.d II, M, -1 + slli.d II, II, BASE_SHIFT + + fldx.s a1, AO1, II + fldx.s a3, YY, IY + + fmadd.s a3, $f12, a1, a3 + + fstx.s a3, YY, IY + + add.d IY, IY, INCY + +.L05: + add.d AO1, AO1, LDA + add.d IX, IX, INCX + + addi.d J, J, 1 + blt J, N, .L01 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 32 + LDARG $r27, $sp, 40 + LDARG $r28, $sp, 48 + LDARG $r29, $sp, 56 + LDARG $r30, $sp, 64 + LD ALPHA, $sp, 72 + addi.d $sp, $sp, 80 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/sgemv_t_8_lasx.S b/kernel/loongarch64/sgemv_t_8_lasx.S new file mode 100644 index 000000000..f4bfffb42 --- /dev/null +++ b/kernel/loongarch64/sgemv_t_8_lasx.S @@ -0,0 +1,405 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/30 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA $f0 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define PY0 $r14 +#define X_ORG $r15 +#define PY1 $r16 +#define K_LDA $r17 +#define PY2 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 +#define M4 $r30 + +#define VALPHA $xr0 +#define X0 $xr1 +#define A0 $xr2 +#define A1 $xr3 +#define A2 $xr4 +#define A3 $xr5 +#define A4 $xr6 +#define A5 $xr7 +#define A6 $xr8 +#define A7 $xr9 +#define TP0 $xr10 +#define TP1 $xr11 +#define TP2 $xr12 +#define TP3 $xr13 +#define TP4 $xr14 +#define TP5 $xr15 +#define TP6 $xr16 +#define TP7 $xr17 +#define Y0 $xr2 +#define Y1 $xr3 +#define Y2 $xr4 +#define Y3 $xr5 +#define Y4 $xr6 +#define Y5 $xr7 +#define Y6 $xr8 +#define Y7 $xr9 + +.macro ZERO_Y8 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \ + TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7 +.endm + +.macro ZERO_Y4 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 +.endm + +.macro ZERO_Y2 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1 +.endm + +.macro ZERO_Y1 + GXOR xv, v, TP0, TP0, TP0 +.endm + +.macro SLOAD_X8 + GLD xv, , X0, X, 0x00 +.endm + +.macro SLOAD_X8_GAP + fld.s $f1, X, 0x00 + fldx.s $f2, X, INC_X + PTR_ALSL T0, INC_X, X, 1 + fld.s $f3, T0, 0x00 + fldx.s $f4, T0, INC_X + GINSVE0 xv, w, X0, A0, 1, X0, A1, 2, X0, A2, 3 + PTR_ALSL T0, INC_X, X, 2 + fld.s $f2, T0, 0x00 + fldx.s $f3, T0, INC_X + PTR_ALSL T0, INC_X, T0, 1 + fld.s $f4, T0, 0x00 + fldx.s $f5, T0, INC_X + GINSVE0 xv, w, X0, A0, 4, X0, A1, 5, X0, A2, 6, X0, A3, 7 +.endm + +.macro SGEMV_T_8x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA1, 0, \ + A2, PA2, 0, A3, PA3, 0, \ + A4, PA4, 0, A5, PA5, 0, \ + A6, PA6, 0, A7, PA7, 0 + GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1, \ + TP2, A2, X0, TP2, TP3, A3, X0, TP3, \ + TP4, A4, X0, TP4, TP5, A5, X0, TP5, \ + TP6, A6, X0, TP6, TP7, A7, X0, TP7 +.endm + +.macro SGEMV_T_4x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA1, 0, \ + A2, PA2, 0, A3, PA3, 0 + GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1, \ + TP2, A2, X0, TP2, TP3, A3, X0, TP3 +.endm + +.macro SGEMV_T_2x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA1, 0 + GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1 +.endm + +.macro SGEMV_T_LASX XW:req X8:req, X4:req + PTR_SRLI J, N, 3 + beqz J, .L_\XW\()_N_7 + PTR_SLLI K_LDA, LDA, 3 + PTR_SUB K_LDA, K_LDA, M4 +.L_\XW\()_N_L8: + ZERO_Y8 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_M_7 +.align 5 +.L_\XW\()_M_L8: + SLOAD_\X8 + SGEMV_T_8x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_M_L8 +.L_\XW\()_M_7: + // Accumulated + GACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \ + Y5, TP5, Y6, TP6, Y7, TP7 + andi I, M, 7 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + fld.s $f1, X, 0x00 + fld.s $f10, PA0, 0x00 + fld.s $f11, PA1, 0x00 + fld.s $f12, PA2, 0x00 + fld.s $f13, PA3, 0x00 + fld.s $f14, PA4, 0x00 + fld.s $f15, PA5, 0x00 + fld.s $f16, PA6, 0x00 + fld.s $f17, PA7, 0x00 +#if __loongarch_grlen == 64 + GADDI , d, PA0, PA0, 0x04, PA1, PA1, 0x04, PA2, PA2, 0x04, PA3, PA3, 0x04, \ + PA4, PA4, 0x04, PA5, PA5, 0x04, PA6, PA6, 0x04, PA7, PA7, 0x04 +#elif __loongarch_grlen == 32 + GADDI , w, PA0, PA0, 0x04, PA1, PA1, 0x04, PA2, PA2, 0x04, PA3, PA3, 0x04, \ + PA4, PA4, 0x04, PA5, PA5, 0x04, PA6, PA6, 0x04, PA7, PA7, 0x04 +#else + GADDI , d, PA0, PA0, 0x04, PA1, PA1, 0x04, PA2, PA2, 0x04, PA3, PA3, 0x04, \ + PA4, PA4, 0x04, PA5, PA5, 0x04, PA6, PA6, 0x04, PA7, PA7, 0x04 +#endif + GMADD f, s, $f2, $f10, $f1, $f2, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, \ + $f6, $f14, $f1, $f6, $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + fld.s $f10, Y, 0x00 + fldx.s $f11, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + fld.s $f12, PY0, 0x00 + fldx.s $f13, PY0, INC_Y + PTR_ALSL PY1, INC_Y, Y, 2 + fld.s $f14, PY1, 0x00 + fldx.s $f15, PY1, INC_Y + PTR_ALSL PY2, INC_Y, PY1, 1 + fld.s $f16, PY2, 0x00 + fldx.s $f17, PY2, INC_Y + + GMADD f, s, $f10, ALPHA, $f2, $f10, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, \ + $f14, ALPHA, $f6, $f14, $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17 + + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#endif + fst.s $f10, Y, 0x00 + fstx.s $f11, Y, INC_Y + fst.s $f12, PY0, 0x00 + fstx.s $f13, PY0, INC_Y + fst.s $f14, PY1, 0x00 + fstx.s $f15, PY1, INC_Y + fst.s $f16, PY2, 0x00 + fstx.s $f17, PY2, INC_Y + + PTR_ALSL Y, INC_Y, Y, 3 + bnez J, .L_\XW\()_N_L8 +.L_\XW\()_N_7: + andi J, N, 4 + beqz J, .L_\XW\()_N_3 + ZERO_Y4 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_4_M_7 +.align 5 +.L_\XW\()_N_4_M_L8: + SLOAD_\X8 + SGEMV_T_4x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_N_4_M_L8 +.L_\XW\()_N_4_M_7: + // Accumulated + GACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 + andi I, M, 7 + beqz I, .L_\XW\()_N_4_M_END +.align 5 +.L_\XW\()_N_4_M_L1: + fld.s $f1, X, 0x00 + GLD_INC f, s, 0x04, $f10, PA0, 0x00, $f11, PA1, 0x00, $f12, PA2, 0x00, $f13, PA3, 0x00 + GMADD f, s, $f2, $f10, $f1, $f2, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_N_4_M_L1 +.L_\XW\()_N_4_M_END: + fld.s $f10, Y, 0x00 + fldx.s $f11, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + fld.s $f12, PY0, 0x00 + fldx.s $f13, PY0, INC_Y + + GMADD f, s, $f10, ALPHA, $f2, $f10, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13 + + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M4 + +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + fst.s $f10, Y, 0x00 + fstx.s $f11, Y, INC_Y + fst.s $f12, PY0, 0x00 + fstx.s $f13, PY0, INC_Y + PTR_ALSL Y, INC_Y, Y, 2 +.L_\XW\()_N_3: + andi J, N, 2 + beqz J, .L_\XW\()_N_1 + ZERO_Y2 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_2_M_7 +.align 5 +.L_\XW\()_N_2_M_L8: + SLOAD_\X8 + SGEMV_T_2x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_N_2_M_L8 +.L_\XW\()_N_2_M_7: + // Accumulated + GACC xvf, s, Y0, TP0, Y1, TP1 + andi I, M, 7 + beqz I, .L_\XW\()_N_2_M_END +.align 5 +.L_\XW\()_N_2_M_L1: + fld.s $f1, X, 0x00 + GLD_INC f, s, 0x04, $f10, PA0, 0x00, $f11, PA1, 0x00 + GMADD f, s, $f2, $f10, $f1, $f2, $f3, $f11, $f1, $f3 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_N_2_M_L1 +.L_\XW\()_N_2_M_END: + fld.s $f10, Y, 0x00 + fldx.s $f11, Y, INC_Y + + GMADD f, s, $f10, ALPHA, $f2, $f10, $f11, ALPHA, $f3, $f11 + + PTR_SLLI K_LDA, LDA, 1 + PTR_SUB K_LDA, K_LDA, M4 + +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#endif + fst.s $f10, Y, 0x00 + fstx.s $f11, Y, INC_Y + PTR_ALSL Y, INC_Y, Y, 1 +.L_\XW\()_N_1: + andi J, N, 1 + beqz J, .L_END + ZERO_Y1 + move X, X_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + fld.s $f2, PA0, 0x00 + fld.s $f1, X, 0x00 + fmadd.s $f10, $f2, $f1, $f10 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + PTR_ADDI PA0, PA0, 0x04 + bnez I, .L_\XW\()_N_1_M_L1 + + fld.s $f2, Y, 0x00 + fmadd.s $f2, ALPHA, $f10, $f2 + fst.s $f2, Y, 0x00 + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 8, 18 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + GSLLI , d, LDA, LDA, 2, INC_X, INC_X, 2, INC_Y, INC_Y, 2, M4, M, 2 + xvreplve0.w VALPHA, $xr0 + move X_ORG, X + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0 - .L_GAP_TABLE + .hword .L_GAP_1 - .L_GAP_TABLE +.L_GAP_0: /* if (incx == 1) */ + SGEMV_T_LASX GAP_0, X8, X4 +.L_GAP_1: /* if (incx != 1) */ + SGEMV_T_LASX GAP_1, X8_GAP, X4_GAP +.L_END: + pop_if_used 17 + 8, 18 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/sgemv_t_lsx.S b/kernel/loongarch64/sgemv_t_lsx.S new file mode 100644 index 000000000..b817f0bbf --- /dev/null +++ b/kernel/loongarch64/sgemv_t_lsx.S @@ -0,0 +1,275 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Param */ +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r16 +#define ALPHA $f0 + +#define YORIG $r18 +#define T0 $r19 +#define T1 $r20 +#define AO3 $r12 +#define AO4 $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define IX $r25 +#define IY $r26 +#define II $r27 +#define T2 $r28 +#define T3 $r29 +#define T4 $r30 + +/* LSX vectors */ +#define U0 $vr11 +#define U1 $vr12 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define VALPHA $vr10 + +#define a1 $f3 +#define a2 $f4 +#define a3 $f5 +#define a4 $f6 +#define a5 $f7 +#define a6 $f8 +#define a7 $f9 +#define a8 $f10 + + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 + + addi.d $sp, $sp, -80 + + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 32 + SDARG $r27, $sp, 40 + SDARG $r28, $sp, 48 + SDARG $r29, $sp, 56 + SDARG $r30, $sp, 64 + ST ALPHA, $sp, 72 + + vldrepl.w VALPHA, $sp, 72 + + slli.d LDA, LDA, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + + bge $r0, M, .L999 + bge $r0, N, .L999 + + move J, $r0 + move IY, $r0 + + move AO1, A //a_ptr1 + + srai.d T0, N, 2 //n/4 + beq J, T0, .L04 + +.L01: /* j 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvand.v D0, U0, U0 + xvand.v D1, U1, U1 + xvand.v D2, U2, U2 + xvand.v D3, U3, U3 + + xvpermi.q D0, U2, 0x02 + xvpermi.q D2, U0, 0x31 + xvpermi.q D1, U3, 0x02 + xvpermi.q D3, U1, 0x31 + + xvst D0, TD, 0x00 + xvst D2, TD, 0x20 + xvst D1, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x40 // a_offset1 + addi.d S2, S2, 0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + xvld U0, TS, 0x00 + xvld U1, TS, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d TS, TS, 0x40 // a_offset + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + vld $vr0, TS, 0x00 + vst $vr0, TD, 0x00 + + + addi.d TS, TS, 0x10 // a_offset + addi.d TD, TD, 0x10 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_ncopy_4_lasx.S b/kernel/loongarch64/zgemm_ncopy_4_lasx.S new file mode 100644 index 000000000..5d874bcc1 --- /dev/null +++ b/kernel/loongarch64/zgemm_ncopy_4_lasx.S @@ -0,0 +1,320 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define TD $r20 +#define TS $r11 +#define TL $r19 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 +#define D8 $xr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x03 + slli.d TL, TL, 0x01 + + srai.d J, N, 0x02 + beq J, ZERO, .L_N0 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq I, ZERO, .L_I3 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvand.v D0, U0, U0 + xvand.v D1, U1, U1 + xvand.v D2, U2, U2 + xvand.v D3, U3, U3 + xvand.v D4, U4, U4 + xvand.v D5, U5, U5 + xvand.v D6, U6, U6 + xvand.v D7, U7, U7 + + xvpermi.q D0, U2, 0x02 + xvpermi.q D4, U6, 0x02 + xvpermi.q D2, U0, 0x31 + xvpermi.q D6, U4, 0x31 + xvpermi.q D1, U3, 0x02 + xvpermi.q D5, U7, 0x02 + xvpermi.q D3, U1, 0x31 + xvpermi.q D7, U5, 0x31 + + xvst D0, TD, 0x00 + xvst D4, TD, 0x20 + xvst D2, TD, 0x40 + xvst D6, TD, 0x60 + xvst D1, TD, 0x80 + xvst D5, TD, 0xa0 + xvst D3, TD, 0xc0 + xvst D7, TD, 0xe0 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d TD, TD, 0x100 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I3: + andi I, M, 0x02 + beq I, ZERO, .L_II20 + +.L_II1: /* if(m&2) */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvand.v D0, U0, U0 + xvand.v D1, U1, U1 + xvand.v D2, U2, U2 + xvand.v D3, U3, U3 + + xvpermi.q D0, U1, 0x02 + xvpermi.q D2, U3, 0x02 + xvpermi.q D1, U0, 0x31 + xvpermi.q D3, U2, 0x31 + + xvst D0, TD, 0x00 + xvst D2, TD, 0x20 + xvst D1, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 + +.L_II20: + andi I, M, 0x01 + beq I, ZERO, .L_J0 + +.L_II2: /* if(m&1) */ + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + vld $vr2, S3, 0x00 + vld $vr3, S4, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + vst $vr2, TD, 0x20 + vst $vr3, TD, 0x30 + + addi.d TD, TD, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N0: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N20 + + move S1, TS + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq ZERO, I, .L_N10 + +.L_N11: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvand.v D0, U0, U0 + xvand.v D1, U1, U1 + xvand.v D2, U2, U2 + xvand.v D3, U3, U3 + + xvpermi.q D0, U2, 0x02 + xvpermi.q D2, U0, 0x31 + xvpermi.q D1, U3, 0x02 + xvpermi.q D3, U1, 0x31 + + xvst D0, TD, 0x00 + xvst D2, TD, 0x20 + xvst D1, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvand.v D0, U0, U0 + + xvpermi.q D0, U1, 0x02 + xvpermi.q U1, U0, 0x31 + + xvst D0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + + addi.d TD, TD, 0x20 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d S1, S1, 0x40 // aoffset1 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + xvld U0, S1, 0x00 + xvst U0, TD, 0x00 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + vld $vr0, S1, 0x00 + vst $vr0, TD, 0x00 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_ncopy_4_lsx.S b/kernel/loongarch64/zgemm_ncopy_4_lsx.S new file mode 100644 index 000000000..203471cbd --- /dev/null +++ b/kernel/loongarch64/zgemm_ncopy_4_lsx.S @@ -0,0 +1,332 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define TD $r20 +#define TS $r11 +#define TL $r19 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x03 + slli.d TL, TL, 0x01 + + srai.d J, N, 0x02 + beq J, ZERO, .L_N0 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq I, ZERO, .L_I3 + +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vld U8, S3, 0x00 + vld U9, S3, 0x10 + vld U10, S3, 0x20 + vld U11, S3, 0x30 + + vld U12, S4, 0x00 + vld U13, S4, 0x10 + vld U14, S4, 0x20 + vld U15, S4, 0x30 + + vst U0, TD, 0x00 + vst U4, TD, 0x10 + vst U8, TD, 0x20 + vst U12, TD, 0x30 + + vst U1, TD, 0x40 + vst U5, TD, 0x50 + vst U9, TD, 0x60 + vst U13, TD, 0x70 + + vst U2, TD, 0x80 + vst U6, TD, 0x90 + vst U10, TD, 0xa0 + vst U14, TD, 0xb0 + + vst U3, TD, 0xc0 + vst U7, TD, 0xd0 + vst U11, TD, 0xe0 + vst U15, TD, 0xf0 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d TD, TD, 0x100 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I3: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_II20 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, TD, 0x00 + vst U2, TD, 0x10 + vst U4, TD, 0x20 + vst U6, TD, 0x30 + + vst U1, TD, 0x40 + vst U3, TD, 0x50 + vst U5, TD, 0x60 + vst U7, TD, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 + +.L_II20: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_J0 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + addi.d TD, TD, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N0: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N20 + + move S1, TS + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq ZERO, I, .L_N10 + +.L_N11: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, TD, 0x00 + vst U4, TD, 0x10 + vst U1, TD, 0x20 + vst U5, TD, 0x30 + + vst U2, TD, 0x40 + vst U6, TD, 0x50 + vst U3, TD, 0x60 + vst U7, TD, 0x70 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, TD, 0x00 + vst U2, TD, 0x10 + vst U1, TD, 0x20 + vst U3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d TD, TD, 0x20 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + addi.d S1, S1, 0x40 // aoffset1 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + vld U0, S1, 0x00 + + vst U0, TD, 0x00 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_ncopy_8_lasx.S b/kernel/loongarch64/zgemm_ncopy_8_lasx.S new file mode 100644 index 000000000..7cd8f605b --- /dev/null +++ b/kernel/loongarch64/zgemm_ncopy_8_lasx.S @@ -0,0 +1,263 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 +#define D8 $xr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 + + slli.d T0, TL, 0x03 + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d S5, S4, TL + add.d S6, S5, TL + add.d S7, S6, TL + add.d S8, S7, TL + add.d TS, TS, T0 + + beq I, ZERO, .L_J11 + +.L_I1: /* if(i>0) i--*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + fld.d F2, S2, 0x00 + fld.d F3, S2, 0x08 + fld.d F4, S3, 0x00 + fld.d F5, S3, 0x08 + fld.d F6, S4, 0x00 + fld.d F7, S4, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + fst.d F4, TD, 0x20 + fst.d F5, TD, 0x28 + fst.d F6, TD, 0x30 + fst.d F7, TD, 0x38 + + fld.d F0, S5, 0x00 + fld.d F1, S5, 0x08 + fld.d F2, S6, 0x00 + fld.d F3, S6, 0x08 + fld.d F4, S7, 0x00 + fld.d F5, S7, 0x08 + fld.d F6, S8, 0x00 + fld.d F7, S8, 0x08 + + fst.d F0, TD, 0x40 + fst.d F1, TD, 0x48 + fst.d F2, TD, 0x50 + fst.d F3, TD, 0x58 + fst.d F4, TD, 0x60 + fst.d F5, TD, 0x68 + fst.d F6, TD, 0x70 + fst.d F7, TD, 0x78 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_J11: /* j--*/ + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4)*/ + andi I, N, 0x04 + beq I, ZERO, .L_N2 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d TS, S4, TL + + beq I, ZERO, .L_N2 + +.L_N11: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + fld.d F2, S2, 0x00 + fld.d F3, S2, 0x08 + fld.d F4, S3, 0x00 + fld.d F5, S3, 0x08 + fld.d F6, S4, 0x00 + fld.d F7, S4, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + fst.d F4, TD, 0x20 + fst.d F5, TD, 0x28 + fst.d F6, TD, 0x30 + fst.d F7, TD, 0x38 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N2: /* if(n&2)*/ + andi I, N, 0x02 + beq I, ZERO, .L_N3 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d TS, S2, TL + + beq I, ZERO, .L_N3 + +.L_N21: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + fld.d F2, S2, 0x00 + fld.d F3, S2, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N3: /* if(n&2)*/ + andi I, N, 0x01 + beq I, ZERO, .L_N0 + + move S1, TS + move I, M + + beq I, ZERO, .L_N0 + +.L_N31: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + + addi.d S1, S1, 0x10 + addi.d TD, TD, 0x10 + + addi.d I, I, -1 + blt ZERO, I, .L_N31 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_2_lasx.S b/kernel/loongarch64/zgemm_tcopy_2_lasx.S new file mode 100644 index 000000000..3fe17beef --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_2_lasx.S @@ -0,0 +1,212 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, S8, 0x00 + xvst U2, S8, 0x20 + + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + xvst U1, S8, 0x00 + xvst U3, S8, 0x20 + + add.d S8, S8, T0 + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S9, 0x00 + vst $vr1, S9, 0x10 + + addi.d S9, S9, 0x20 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + xvld U0, TS, 0x00 + xvld U1, TS, 0x20 + + xvst U0, TD, 0x00 + + slli.d T0, M, 0x05 + add.d TD, TD, T0 + + xvst U1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + xvld U0, TS, 0x00 + + xvst U0, TD, 0x00 + + addi.d TS, TS, 0x20 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + vld $vr0, TS, 0x00 + + vst $vr0, S9, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_4_lasx.S b/kernel/loongarch64/zgemm_tcopy_4_lasx.S new file mode 100644 index 000000000..1adee11c5 --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_4_lasx.S @@ -0,0 +1,302 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x100 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + xvst U2, S8, 0x40 + xvst U3, S8, 0x60 + xvst U4, S8, 0x80 + xvst U5, S8, 0xa0 + xvst U6, S8, 0xc0 + xvst U7, S8, 0xe0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, S9, 0x00 + xvst U1, S9, 0x20 + xvst U2, S9, 0x40 + xvst U3, S9, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S9, S9, 0x80 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + vld $vr2, S3, 0x00 + vld $vr3, S4, 0x00 + + vst $vr0, S10, 0x00 + vst $vr1, S10, 0x10 + vst $vr2, S10, 0x20 + vst $vr3, S10, 0x30 + + addi.d S10, S10, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + xvst U2, S8, 0x40 + xvst U3, S8, 0x60 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, S9, 0x00 + xvst U1, S9, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S9, S9, 0x40 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S10, 0x00 + vst $vr1, S10, 0x10 + + addi.d S10, S10, 0x20 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + + addi.d S1, S1, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + xvld U0, S1, 0x00 + + xvst U0, S9, 0x00 + + addi.d S1, S1, 0x20 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + vld $vr0, S1, 0x00 + + vst $vr0, S10, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_4_lsx.S b/kernel/loongarch64/zgemm_tcopy_4_lsx.S new file mode 100644 index 000000000..954753eaf --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_4_lsx.S @@ -0,0 +1,355 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x100 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vld U8, S3, 0x00 + vld U9, S3, 0x10 + vld U10, S3, 0x20 + vld U11, S3, 0x30 + + vld U12, S4, 0x00 + vld U13, S4, 0x10 + vld U14, S4, 0x20 + vld U15, S4, 0x30 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + vst U4, S8, 0x40 + vst U5, S8, 0x50 + vst U6, S8, 0x60 + vst U7, S8, 0x70 + + vst U8, S8, 0x80 + vst U9, S8, 0x90 + vst U10, S8, 0xa0 + vst U11, S8, 0xb0 + vst U12, S8, 0xc0 + vst U13, S8, 0xd0 + vst U14, S8, 0xe0 + vst U15, S8, 0xf0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + vst U2, S9, 0x20 + vst U3, S9, 0x30 + + vst U4, S9, 0x40 + vst U5, S9, 0x50 + vst U6, S9, 0x60 + vst U7, S9, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S9, S9, 0x80 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, S10, 0x00 + vst U1, S10, 0x10 + vst U2, S10, 0x20 + vst U3, S10, 0x30 + + addi.d S10, S10, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + + vst U4, S8, 0x40 + vst U5, S8, 0x50 + vst U6, S8, 0x60 + vst U7, S8, 0x70 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + vst U2, S9, 0x20 + vst U3, S9, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S9, S9, 0x40 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, S10, 0x00 + vst U1, S10, 0x10 + + addi.d S10, S10, 0x20 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + + addi.d S1, S1, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + + addi.d S1, S1, 0x20 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + vld U0, S1, 0x00 + + vst U0, S10, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_8_lasx.S b/kernel/loongarch64/zgemm_tcopy_8_lasx.S new file mode 100644 index 000000000..f7440dc24 --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_8_lasx.S @@ -0,0 +1,268 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 + + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x80 + + srai.d I, M, 0x01 + beq ZERO, I, .L_J1M1 + +.L_J1I1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + xvst U2, TD, 0x40 + xvst U3, TD, 0x60 + xvst U4, TD, 0x80 + xvst U5, TD, 0xa0 + xvst U6, TD, 0xc0 + xvst U7, TD, 0xe0 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x100 + + addi.d I, I, -1 + blt ZERO, I, .L_J1I1 + +.L_J1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_J0 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + xvst U2, TD, 0x40 + xvst U3, TD, 0x60 + + addi.d TD, TD, 0x80 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4) */ + andi I, N, 0x04 + beq ZERO, I, .L_N2 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x40 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N1M1 + +.L_N1I1: /* if(i>0) i-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + xvst U2, TD, 0x40 + xvst U3, TD, 0x60 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_N1I1 + +.L_N1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N2 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d TD, TD, 0x40 + +.L_N2: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N3 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x20 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N2M1 + +.L_N2I1: /* if(i>0) i-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + add.d S1, S1, T0 + add.d S2, S2, T0 + + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_N2I1 + +.L_N2M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N3 + + xvld U0, S1, 0x00 + + xvst U0, TD, 0x00 + + addi.d TD, TD, 0x20 + +.L_N3: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + + srai.d I, M, 0x01 + beq ZERO, I, .L_N3M1 + +.L_N3I1: /* if(i>0) i-- */ + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N3I1 + +.L_N3M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N0 + + vld $vr0, S1, 0x00 + + vst $vr0, TD, 0x00 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemv_n_2_lsx.S b/kernel/loongarch64/zgemv_n_2_lsx.S new file mode 100644 index 000000000..efb376118 --- /dev/null +++ b/kernel/loongarch64/zgemv_n_2_lsx.S @@ -0,0 +1,296 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define Y_ORG $r15 +#define OFFSET $r16 +#define K_LDA $r17 +#define M16 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 + +#define VALPHA $vr1 +#define X0 $vr2 +#define X1 $vr3 +#define X2 $vr4 +#define X3 $vr5 +#define X4 $vr6 +#define X5 $vr7 +#define X6 $vr8 +#define X7 $vr9 +#define Y0 $vr10 +#define Y1 $vr11 +#define A0 $vr12 +#define A1 $vr13 +#define A2 $vr14 +#define A3 $vr15 +#define A4 $vr16 +#define A5 $vr17 +#define A6 $vr18 +#define A7 $vr19 +#define A8 $vr20 +#define A9 $vr21 +#define A10 $vr22 +#define A11 $vr23 +#define A12 $vr24 +#define A13 $vr25 +#define A14 $vr26 +#define A15 $vr27 +#define TMP0 $vr28 +#define TMP1 $vr29 +#define TMP2 $vr30 + +#if !defined(CONJ) +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 0 +#else +#define GXCONJ 1 +#define GCONJ 0 +#endif +#else +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 1 +#else +#define GXCONJ 1 +#define GCONJ 1 +#endif +#endif + +.macro ZLOAD_X_2 + GLD v, , X0, X, 0x00, X1, X, 0x10 + GCOMPLEXMUL GXCONJ, \ + vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2 +.endm + +.macro ZLOAD_X_2_GAP + vld X0, X, 0 + PTR_ADD T0, X, INC_X + vld X1, T0, 0 + + GCOMPLEXMUL GXCONJ, \ + vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2 +.endm + +.macro ZLOAD_X_1 + GLD v, , X0, X, 0x00 + GCOMPLEXMUL GXCONJ, \ + vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2 +.endm + +.macro ZLOAD_Y_2 + GLD v, , Y0, Y, 0, Y1, Y, 0x10 +.endm + +.macro ZLOAD_Y_2_GAP + vld $vr10, Y, 0 + vldx $vr11, Y, INC_Y +.endm + +.macro ZLOAD_Y_1 + vld $vr10, Y, 0 +.endm + +.macro ZGEMV_N_2x2 + GLD_INC v, , 0x10, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2 +.endm + +.macro ZGEMV_N_1x2 + GLD_INC v, , 0x10, $vr12, PA0, 0, $vr14, PA1, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2 +.endm + +.macro ZGEMV_N_1x1 + GLD_INC v, , 0x10, $vr12, PA0, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2 +.endm + +.macro ZSTORE_Y_2 + GST v, , Y0, Y, 0, Y1, Y, 0x10 +.endm + +.macro ZSTORE_Y_2_GAP + vst Y0, Y, 0 + vstx Y1, Y, INC_Y +.endm + +.macro ZSTORE_Y_1 + vst $vr10, Y, 0 +.endm + +.macro ZGEMV_N_LSX XW:req, X_2:req, X_1:req, Y_2:req, Y_1:req + PTR_SRLI J, N, 1 + beqz J, .L_\XW\()_N_1 + PTR_SLLI K_LDA, LDA, 1 + PTR_SUB K_LDA, K_LDA, M16 +.L_\XW\()_N_L2: + ZLOAD_\X_2 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 1 + beqz I, .L_\XW\()_M_1 +.align 5 +.L_\XW\()_M_L2: + ZLOAD_\Y_2 + ZGEMV_N_2x2 + ZSTORE_\Y_2 + PTR_ADDI I, I, -1 + PTR_ALSL Y, INC_Y, Y, 1 + PTR_ADDI K, K, 4 + bnez I, .L_\XW\()_M_L2 +.L_\XW\()_M_1: + andi I, M, 1 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + ZLOAD_\Y_1 + ZGEMV_N_1x2 + ZSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#endif + PTR_ALSL X, INC_X, X, 1 + bnez J, .L_\XW\()_N_L2 +.L_\XW\()_N_1: + andi J, N, 1 + beqz J, .L_END +.L_\XW\()_N_L1: + ZLOAD_\X_1 + xor K, K, K + move Y, Y_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + ZLOAD_\Y_1 + ZGEMV_N_1x1 + ZSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + PTR_SUB K_LDA, LDA, M16 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD X, X, INC_X + bnez J, .L_\XW\()_N_L1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 7, 31 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + PTR_SUB J, INC_Y, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ + PTR_ALSL I, I, J, 1 + GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4 + // Init VALPHA + vpackev.d VALPHA, $vr1, $vr0 + move Y_ORG, Y + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA +#else + GADD , d, PA1, PA0, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 // Obtain the offset address + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0_0 - .L_GAP_TABLE + .hword .L_GAP_0_1 - .L_GAP_TABLE + .hword .L_GAP_1_0 - .L_GAP_TABLE + .hword .L_GAP_1_1 - .L_GAP_TABLE +.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ + ZGEMV_N_LSX GAP_0_0, X_2, X_1, Y_2, Y_1 +.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ + ZGEMV_N_LSX GAP_0_1, X_2, X_1, Y_2_GAP, Y_1 +.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ + ZGEMV_N_LSX GAP_1_0, X_2_GAP, X_1, Y_2, Y_1 +.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ + ZGEMV_N_LSX GAP_1_1, X_2_GAP, X_1, Y_2_GAP, Y_1 +.L_END: + pop_if_used 17 + 7, 31 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/zgemv_n_4_lasx.S b/kernel/loongarch64/zgemv_n_4_lasx.S new file mode 100644 index 000000000..26edf1ed7 --- /dev/null +++ b/kernel/loongarch64/zgemv_n_4_lasx.S @@ -0,0 +1,343 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2024/02/20 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define Y_ORG $r15 +#define OFFSET $r16 +#define K_LDA $r17 +#define M16 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 + +#define VALPHA $xr1 +#define X0 $xr2 +#define X1 $xr3 +#define X2 $xr4 +#define X3 $xr5 +#define X4 $xr6 +#define X5 $xr7 +#define X6 $xr8 +#define X7 $xr9 +#define Y0 $xr10 +#define Y1 $xr11 +#define A0 $xr12 +#define A1 $xr13 +#define A2 $xr14 +#define A3 $xr15 +#define A4 $xr16 +#define A5 $xr17 +#define A6 $xr18 +#define A7 $xr19 +#define A8 $xr20 +#define A9 $xr21 +#define A10 $xr22 +#define A11 $xr23 +#define A12 $xr24 +#define A13 $xr25 +#define A14 $xr26 +#define A15 $xr27 +#define TMP0 $xr28 +#define TMP1 $xr29 +#define TMP2 $xr30 + +#if !defined(CONJ) +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 0 +#else +#define GXCONJ 1 +#define GCONJ 0 +#endif +#else +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 1 +#else +#define GXCONJ 1 +#define GCONJ 1 +#endif +#endif + +.macro ZLOAD_X_4 + GLD xv, , X0, X, 0x00, X1, X, 0x10, X2, X, 0x20, X3, X, 0x30 + GPERMI xv, q, X0, X0, 0, X1, X1, 0, X2, X2, 0, X3, X3, 0 + GCOMPLEXMUL GXCONJ, \ + xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2 +.endm + +.macro ZLOAD_X_4_GAP + xvld X0, X, 0 + xvpermi.q X0, X0, 0 + + PTR_ADD T0, X, INC_X + xvld X1, T0, 0 + xvpermi.q X1, X1, 0 + + PTR_ADD T0, T0, INC_X + xvld X2, T0, 0 + xvpermi.q X2, X2, 0 + + PTR_ADD T0, T0, INC_X + xvld X3, T0, 0 + xvpermi.q X3, X3, 0 + + GCOMPLEXMUL GXCONJ, \ + xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2 +.endm + +.macro ZLOAD_Y_4 + GLD xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro ZLOAD_Y_4_GAP + vld $vr10, Y, 0 + vldx $vr13, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + vld $vr11, T0, 0 + vldx $vr14, T0, INC_Y + GPERMI xv, q, Y0, A1, 0x02, Y1, A2, 0x02 +.endm + +.macro ZGEMV_N_4x4 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \ + Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \ + Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2 +.endm + +.macro ZSTORE_Y_4 + GST xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro ZSTORE_Y_4_GAP + xvstelm.d Y0, Y, 0, 0 + xvstelm.d Y0, Y, 0x08, 1 + PTR_ADD T0, Y, INC_Y + xvstelm.d Y0, T0, 0, 2 + xvstelm.d Y0, T0, 0x08, 3 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 0 + xvstelm.d Y1, T0, 0x08, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 2 + xvstelm.d Y1, T0, 0x08, 3 +.endm + +.macro ZLOAD_Y_1 + vld $vr10, Y, 0 +.endm + +.macro ZGEMV_N_1x4 + GLD_INC v, , 0x10, $vr12, PA0, 0, $vr14, PA1, 0, $vr16, PA2, 0, $vr18, PA3, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \ + Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \ + Y0, X3, A6, Y0, TMP0, TMP1, TMP2 +.endm + +.macro ZSTORE_Y_1 + vst $vr10, Y, 0 +.endm + +.macro ZLOAD_X_1 + GLD xv, , X0, X, 0x00 + GPERMI xv, q, X0, X0, 0 + GCOMPLEXMUL GXCONJ, \ + xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2 +.endm + +.macro ZGEMV_N_1x1 + GLD_INC v, , 0x10, $vr12, PA0, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2 +.endm + +.macro ZGEMV_N_LASX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req + PTR_SRLI J, N, 2 + beqz J, .L_\XW\()_N_3 + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M16 +.L_\XW\()_N_L4: + ZLOAD_\X_4 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 2 + beqz I, .L_\XW\()_M_3 +.align 5 +.L_\XW\()_M_L4: + ZLOAD_\Y_4 + ZGEMV_N_4x4 + ZSTORE_\Y_4 + PTR_ADDI I, I, -1 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 + bnez I, .L_\XW\()_M_L4 +.L_\XW\()_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + ZLOAD_\Y_1 + ZGEMV_N_1x4 + ZSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + PTR_ALSL X, INC_X, X, 2 + bnez J, .L_\XW\()_N_L4 +.L_\XW\()_N_3: + andi J, N, 3 + beqz J, .L_END +.L_\XW\()_N_L1: + ZLOAD_\X_1 + xor K, K, K + move Y, Y_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + ZLOAD_\Y_1 + ZGEMV_N_1x1 + ZSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + PTR_SUB K_LDA, LDA, M16 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD X, X, INC_X + bnez J, .L_\XW\()_N_L1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 7, 31 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + PTR_SUB J, INC_Y, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ + PTR_ALSL I, I, J, 1 + GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4 + // Init VALPHA + xvpackev.d $xr0, $xr1, $xr0 + xvreplve0.q VALPHA, $xr0 + move Y_ORG, Y + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 // Obtain the offset address + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0_0 - .L_GAP_TABLE + .hword .L_GAP_0_1 - .L_GAP_TABLE + .hword .L_GAP_1_0 - .L_GAP_TABLE + .hword .L_GAP_1_1 - .L_GAP_TABLE +.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ + ZGEMV_N_LASX GAP_0_0, X_4, X_1, Y_4, Y_1 +.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ + ZGEMV_N_LASX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1 +.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ + ZGEMV_N_LASX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1 +.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ + ZGEMV_N_LASX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1 +.L_END: + pop_if_used 17 + 7, 31 + jirl $r0, $r1, 0x0 + EPILOGUE + diff --git a/kernel/loongarch64/zgemv_t_2_lsx.S b/kernel/loongarch64/zgemv_t_2_lsx.S new file mode 100644 index 000000000..2a0fc172e --- /dev/null +++ b/kernel/loongarch64/zgemv_t_2_lsx.S @@ -0,0 +1,268 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define PY0 $r14 +#define X_ORG $r15 +#define PY1 $r16 +#define K_LDA $r17 +#define PY2 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 +#define M16 $r30 + +#define VALPHA $vr0 +#define X0 $vr1 +#define X1 $vr2 +#define A0 $vr3 +#define A1 $vr4 +#define A2 $vr5 +#define A3 $vr6 +#define A4 $vr7 +#define A5 $vr8 +#define A6 $vr9 +#define A7 $vr10 +#define A8 $vr11 +#define A9 $vr12 +#define A10 $vr13 +#define A11 $vr14 +#define A12 $vr15 +#define A13 $vr16 +#define A14 $vr17 +#define A15 $vr18 +#define TP0 $vr19 +#define TP1 $vr20 +#define TP2 $vr21 +#define TP3 $vr22 +#define TP4 $vr23 +#define TP5 $vr24 +#define TP6 $vr25 +#define TP7 $vr26 +#define TMP0 $vr27 +#define TMP1 $vr28 +#define TMP2 $vr29 +#define Y0 $vr3 +#define Y1 $vr4 +#define Y2 $vr5 +#define Y3 $vr6 +#define Y4 $vr7 +#define Y5 $vr8 +#define Y6 $vr9 +#define Y7 $vr10 + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +#define GXCONJ1 0 +#define GCONJ1 0 +#else +#define GXCONJ1 1 +#define GCONJ1 0 +#endif + +#if !defined(XCONJ) +#define GXCONJ2 0 +#define GCONJ2 0 +#else +#define GXCONJ2 0 +#define GCONJ2 1 +#endif + +.macro ZERO_Y2 + GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1 +.endm + +.macro ZERO_Y1 + GXOR v, v, TP0, TP0, TP0 +.endm + +.macro ZLOAD_X2 + GLD v, , X0, X, 0x00, X1, X, 0x10 +.endm + +.macro ZLOAD_X2_GAP + vld X0, X, 0 + vldx X1, X, INC_X +.endm + +.macro ZGEMV_T_2x2 + GLD_INC v, , 0x10, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + vf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \ + TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2 +.endm + +.macro ZGEMV_T_LSX XW:req, X2:req + PTR_SRLI J, N, 1 + beqz J, .L_\XW\()_N_1 + PTR_SLLI K_LDA, LDA, 1 + PTR_SUB K_LDA, K_LDA, M16 +.L_\XW\()_N_L2: + ZERO_Y2 + move X, X_ORG + PTR_SRLI I, M, 1 + beqz I, .L_\XW\()_M_1 +.align 5 +.L_\XW\()_M_L2: + ZLOAD_\X2 + ZGEMV_T_2x2 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 1 + bnez I, .L_\XW\()_M_L2 +.L_\XW\()_M_1: + // Accumulated + GCOMPLEXACC vf, d, Y0, TP0, Y1, TP1 + andi I, M, 1 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + GLD v, , X0, X, 0x00, A8, PA0, 0x00, A9, PA1, 0x00 +#if __loongarch_grlen == 64 + GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10 +#elif __loongarch_grlen == 32 + GADDI , w, PA0, PA0, 0x10, PA1, PA1, 0x10 +#else + GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10 +#endif + + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + vf, d, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2 + + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + vld A8, Y, 0x00 + vldx A9, Y, INC_Y + + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + vf, d, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2 + + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#endif + vst $vr11, Y, 0x00 + vstx $vr12, Y, INC_Y + PTR_ALSL Y, INC_Y, Y, 1 + bnez J, .L_\XW\()_N_L2 +.L_\XW\()_N_1: + andi J, N, 1 + beqz J, .L_END + PTR_SUB K_LDA, LDA, M16 +.L_\XW\()_N_L1: + ZERO_Y1 + move X, X_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + GLD v, , A0, PA0, 0x00, X0, X, 0x00 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + vf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + PTR_ADDI PA0, PA0, 0x10 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + vld A0, Y, 0x00 + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + vf, d, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2 + vst $vr3, Y, 0x00 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD Y, Y, INC_Y + bnez J, .L_\XW\()_N_L1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 8, 30 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4 + // Init VALPHA + vpackev.d VALPHA, $vr1, $vr0 + move X_ORG, X + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA +#else + GADD , d, PA1, PA0, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0 - .L_GAP_TABLE + .hword .L_GAP_1 - .L_GAP_TABLE +.L_GAP_0: /* if (incx == 1) */ + ZGEMV_T_LSX GAP_0, X2 +.L_GAP_1: /* if (incx != 1) */ + ZGEMV_T_LSX GAP_1, X2_GAP +.L_END: + pop_if_used 17 + 8, 30 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/zgemv_t_4_lasx.S b/kernel/loongarch64/zgemv_t_4_lasx.S new file mode 100644 index 000000000..4d33b8f96 --- /dev/null +++ b/kernel/loongarch64/zgemv_t_4_lasx.S @@ -0,0 +1,299 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2024/02/20 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define PY0 $r14 +#define X_ORG $r15 +#define PY1 $r16 +#define K_LDA $r17 +#define PY2 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 +#define M16 $r30 + +#define VALPHA $xr0 +#define X0 $xr1 +#define X1 $xr2 +#define A0 $xr3 +#define A1 $xr4 +#define A2 $xr5 +#define A3 $xr6 +#define A4 $xr7 +#define A5 $xr8 +#define A6 $xr9 +#define A7 $xr10 +#define A8 $xr11 +#define A9 $xr12 +#define A10 $xr13 +#define A11 $xr14 +#define A12 $xr15 +#define A13 $xr16 +#define A14 $xr17 +#define A15 $xr18 +#define TP0 $xr19 +#define TP1 $xr20 +#define TP2 $xr21 +#define TP3 $xr22 +#define TP4 $xr23 +#define TP5 $xr24 +#define TP6 $xr25 +#define TP7 $xr26 +#define TMP0 $xr27 +#define TMP1 $xr28 +#define TMP2 $xr29 +#define Y0 $xr3 +#define Y1 $xr4 +#define Y2 $xr5 +#define Y3 $xr6 +#define Y4 $xr7 +#define Y5 $xr8 +#define Y6 $xr9 +#define Y7 $xr10 + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +#define GXCONJ1 0 +#define GCONJ1 0 +#else +#define GXCONJ1 1 +#define GCONJ1 0 +#endif + +#if !defined(XCONJ) +#define GXCONJ2 0 +#define GCONJ2 0 +#else +#define GXCONJ2 0 +#define GCONJ2 1 +#endif + +.macro ZERO_Y4 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 +.endm + +.macro ZERO_Y1 + GXOR xv, v, TP0, TP0, TP0 +.endm + +.macro ZLOAD_X4 + GLD xv, , X0, X, 0x00, X1, X, 0x20 +.endm + +.macro ZLOAD_X4_GAP + xvld X0, X, 0 + + PTR_ADD T0, X, INC_X + xvld A0, T0, 0 + xvpermi.q X0, A0, 0x02 + + PTR_ADD T0, T0, INC_X + xvld X1, T0, 0 + + PTR_ADD T0, T0, INC_X + xvld A0, T0, 0 + xvpermi.q X1, A0, 0x02 +.endm + +.macro ZGEMV_T_4x4 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + xvf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \ + TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \ + TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \ + TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2 +.endm + +.macro ZGEMV_T_LASX XW:req, X4:req + PTR_SRLI J, N, 2 + beqz J, .L_\XW\()_N_3 + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M16 +.L_\XW\()_N_L4: + ZERO_Y4 + move X, X_ORG + PTR_SRLI I, M, 2 + beqz I, .L_\XW\()_M_3 +.align 5 +.L_\XW\()_M_L4: + ZLOAD_\X4 + ZGEMV_T_4x4 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 2 + bnez I, .L_\XW\()_M_L4 +.L_\XW\()_M_3: + // Accumulated + GCOMPLEXACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + GLD xv, , X0, X, 0x00, A8, PA0, 0x00, A9, PA1, 0x00, A10, PA2, 0x00, A11, PA3, 0x00 +#if __loongarch_grlen == 64 + GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10 +#elif __loongarch_grlen == 32 + GADDI , w, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10 +#else + GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10 +#endif + + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + xvf, d, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \ + A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2 + + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + xvld A8, Y, 0x00 + xvldx A9, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + xvld A10, PY0, 0x00 + xvldx A11, PY0, INC_Y + + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + xvf, d, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\ + A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2 + + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + vst $vr11, Y, 0x00 + vstx $vr12, Y, INC_Y + vst $vr13, PY0, 0x00 + vstx $vr14, PY0, INC_Y + PTR_ALSL Y, INC_Y, Y, 2 + bnez J, .L_\XW\()_N_L4 +.L_\XW\()_N_3: + andi J, N, 3 + beqz J, .L_END + PTR_SUB K_LDA, LDA, M16 +.L_\XW\()_N_1: + ZERO_Y1 + move X, X_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + GLD xv, , A0, PA0, 0x00, X0, X, 0x00 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + xvf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + PTR_ADDI PA0, PA0, 0x10 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + xvld A0, Y, 0x00 + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + xvf, d, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2 + vst $vr3, Y, 0x00 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD Y, Y, INC_Y + bnez J, .L_\XW\()_N_1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 8, 30 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4 + // Init VALPHA + xvpackev.d $xr0, $xr1, $xr0 + xvreplve0.q VALPHA, $xr0 + move X_ORG, X + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0 - .L_GAP_TABLE + .hword .L_GAP_1 - .L_GAP_TABLE +.L_GAP_0: /* if (incx == 1) */ + ZGEMV_T_LASX GAP_0, X4 +.L_GAP_1: /* if (incx != 1) */ + ZGEMV_T_LASX GAP_1, X4_GAP +.L_END: + pop_if_used 17 + 8, 30 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/znrm2.S b/kernel/loongarch64/znrm2.S index 49f640268..8e2165ab7 100644 --- a/kernel/loongarch64/znrm2.S +++ b/kernel/loongarch64/znrm2.S @@ -64,7 +64,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MTC s1, $r0 bge $r0, N, .L999 slli.d INCX, INCX, ZBASE_SHIFT - bge $r0, INCX, .L999 + beq $r0, INCX, .L999 move XX, X MOV s2, s1 srai.d I, N, 2 diff --git a/kernel/loongarch64/znrm2_lasx.S b/kernel/loongarch64/znrm2_lasx.S new file mode 100644 index 000000000..53f8a6e05 --- /dev/null +++ b/kernel/loongarch64/znrm2_lasx.S @@ -0,0 +1,252 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 +#define a3 $f15 +#define a2 $f16 +#define VX0 $xr15 +#define VX1 $xr16 +#define VM0 $xr17 +#define VM1 $xr18 +#define VM2 $xr13 +#define VM3 $xr14 +#define res1 $xr19 +#define res2 $xr20 +#define VALPHA $xr21 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v VM0, VM0, VM0 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L97 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmaxa.d VM1, VX1, VX0 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L97 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + xvpickve.d VX0, VM0, 1 + xvpickve.d VX1, VM0, 2 + xvpickve.d VM3, VM0, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM2, VM3, VM0 + xvfmaxa.d VM0, VM1, VM2 + .align 3 + +.L97: + andi I, N, 3 + bge $r0, I, .L99 + .align 3 + +.L98: + fld.d a3, X, 0 * SIZE + fld.d a2, X, 1 * SIZE + fmaxa.d a3, a2, a3 + fmaxa.d max, a3, max + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + xvreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L120 + bge $r0, I, .L997 + .align 3 + +.L110: + xvld VX0, XX, 0 * SIZE + xvld VX1, XX, 4 * SIZE + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + bge $r0, I, .L997 + .align 3 + +.L121: + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + xvfadd.d res1, res1, res2 + xvpickve.d VX0, res1, 1 + xvpickve.d VX1, res1, 2 + xvpickve.d VM2, res1, 3 + xvfadd.d res1, VX0, res1 + xvfadd.d VX1, VX1, VM2 + xvfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d a3, XX, 0 * SIZE + fld.d a2, XX, 1 * SIZE + addi.d I, I, -1 + fmul.d a3, a3, ALPHA + fmadd.d $f19, a3, a3, $f19 + fmul.d a2, a2, ALPHA + fmadd.d $f19, a2, a2, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/znrm2_lsx.S b/kernel/loongarch64/znrm2_lsx.S new file mode 100644 index 000000000..14c59d504 --- /dev/null +++ b/kernel/loongarch64/znrm2_lsx.S @@ -0,0 +1,260 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 +#define a3 $f15 +#define a2 $f16 +#define VX0 $vr15 +#define VX1 $vr16 +#define VM0 $vr17 +#define VM1 $vr18 +#define VM2 $vr13 +#define VM3 $vr14 +#define res1 $vr19 +#define res2 $vr20 +#define VALPHA $vr21 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v VM0, VM0, VM0 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L97 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmaxa.d VM1, VX1, VX0 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmaxa.d VM2, VX1, VX0 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L97 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + vfmaxa.d VM0, VX0, VX1 + .align 3 + +.L97: + andi I, N, 3 + bge $r0, I, .L99 + .align 3 + +.L98: + fld.d a3, X, 0 * SIZE + fld.d a2, X, 1 * SIZE + fmaxa.d a3, a2, a3 + fmaxa.d max, a3, max + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + vreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L120 + bge $r0, I, .L997 + .align 3 + +.L110: + vld VX0, XX, 0 * SIZE + vld VX1, XX, 2 * SIZE + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + vld VX0, XX, 4 * SIZE + vld VX1, XX, 6 * SIZE + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + bge $r0, I, .L997 + .align 3 + +.L121: + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM2, VX0, VALPHA + ld.d t1, XX, 0 * SIZE + vfmul.d VM3, VX1, VALPHA + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d a3, XX, 0 * SIZE + fld.d a2, XX, 1 * SIZE + addi.d I, I, -1 + fmul.d a3, a3, ALPHA + fmadd.d $f19, a3, a3, $f19 + fmul.d a2, a2, ALPHA + fmadd.d $f19, a2, a2, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 9a6e06d67..c37b88adb 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -35,7 +35,7 @@ DSUMKERNEL = ../mips/sum.c CSUMKERNEL = ../mips/zsum.c ZSUMKERNEL = ../mips/zsum.c -ifdef HAVE_MSA +ifndef NO_MSA SASUMKERNEL = ../mips/sasum_msa.c DASUMKERNEL = ../mips/dasum_msa.c CASUMKERNEL = ../mips/casum_msa.c @@ -47,7 +47,7 @@ CASUMKERNEL = ../mips/zasum.c ZASUMKERNEL = ../mips/zasum.c endif -ifdef HAVE_MSA +ifndef NO_MSA SAXPYKERNEL = ../mips/saxpy_msa.c DAXPYKERNEL = ../mips/daxpy_msa.c CAXPYKERNEL = ../mips/caxpy_msa.c @@ -59,7 +59,7 @@ CAXPYKERNEL = ../mips/zaxpy.c ZAXPYKERNEL = ../mips/zaxpy.c endif -ifdef HAVE_MSA +ifndef NO_MSA SCOPYKERNEL = ../mips/scopy_msa.c DCOPYKERNEL = ../mips/dcopy_msa.c CCOPYKERNEL = ../mips/ccopy_msa.c @@ -71,7 +71,7 @@ CCOPYKERNEL = ../mips/zcopy.c ZCOPYKERNEL = ../mips/zcopy.c endif -ifdef HAVE_MSA +ifndef NO_MSA SDOTKERNEL = ../mips/sdot_msa.c DDOTKERNEL = ../mips/ddot_msa.c CDOTKERNEL = ../mips/cdot_msa.c @@ -88,7 +88,7 @@ DNRM2KERNEL = ../mips/nrm2.c CNRM2KERNEL = ../mips/znrm2.c ZNRM2KERNEL = ../mips/znrm2.c -ifdef HAVE_MSA +ifndef NO_MSA SROTKERNEL = ../mips/srot_msa.c DROTKERNEL = ../mips/drot_msa.c CROTKERNEL = ../mips/crot_msa.c @@ -100,11 +100,13 @@ CROTKERNEL = ../mips/zrot.c ZROTKERNEL = ../mips/zrot.c endif -ifdef HAVE_MSA +ifndef NO_MSA SSCALKERNEL = ../mips/sscal_msa.c DSCALKERNEL = ../mips/dscal_msa.c -CSCALKERNEL = ../mips/cscal_msa.c -ZSCALKERNEL = ../mips/zscal_msa.c +#CSCALKERNEL = ../mips/cscal_msa.c +#ZSCALKERNEL = ../mips/zscal_msa.c +CSCALKERNEL = ../mips/zscal.c +ZSCALKERNEL = ../mips/zscal.c else SSCALKERNEL = ../mips/scal.c DSCALKERNEL = ../mips/scal.c @@ -112,7 +114,7 @@ CSCALKERNEL = ../mips/zscal.c ZSCALKERNEL = ../mips/zscal.c endif -ifdef HAVE_MSA +ifndef NO_MSA SSWAPKERNEL = ../mips/sswap_msa.c DSWAPKERNEL = ../mips/dswap_msa.c CSWAPKERNEL = ../mips/cswap_msa.c @@ -124,7 +126,7 @@ CSWAPKERNEL = ../mips/zswap.c ZSWAPKERNEL = ../mips/zswap.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMVNKERNEL = ../mips/sgemv_n_msa.c DGEMVNKERNEL = ../mips/dgemv_n_msa.c CGEMVNKERNEL = ../mips/cgemv_n_msa.c @@ -136,7 +138,7 @@ CGEMVNKERNEL = ../mips/zgemv_n.c ZGEMVNKERNEL = ../mips/zgemv_n.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMVTKERNEL = ../mips/sgemv_t_msa.c DGEMVTKERNEL = ../mips/dgemv_t_msa.c CGEMVTKERNEL = ../mips/cgemv_t_msa.c @@ -148,7 +150,7 @@ CGEMVTKERNEL = ../mips/zgemv_t.c ZGEMVTKERNEL = ../mips/zgemv_t.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c @@ -162,7 +164,7 @@ SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o endif -ifdef HAVE_MSA +ifndef NO_MSA DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c @@ -180,7 +182,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o endif -ifdef HAVE_MSA +ifndef NO_MSA CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c @@ -198,7 +200,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o endif -ifdef HAVE_MSA +ifndef NO_MSA ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c @@ -212,7 +214,7 @@ ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o endif -ifdef HAVE_MSA +ifndef NO_MSA STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c @@ -224,7 +226,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c @@ -236,7 +238,7 @@ DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c @@ -248,7 +250,7 @@ CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/kernel/mips/nrm2.c b/kernel/mips/nrm2.c index fcff09337..8cc189fe3 100644 --- a/kernel/mips/nrm2.c +++ b/kernel/mips/nrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT absxi = 0.0; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); if ( n == 1 ) return( ABS(x[0]) ); n *= inc_x; diff --git a/kernel/mips/znrm2.c b/kernel/mips/znrm2.c index 85be39cd1..d11a6bd4a 100644 --- a/kernel/mips/znrm2.c +++ b/kernel/mips/znrm2.c @@ -48,7 +48,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG inc_x2; FLOAT temp; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); inc_x2 = 2 * inc_x; diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c index bca1155c1..7bb261941 100644 --- a/kernel/mips/zscal.c +++ b/kernel/mips/zscal.c @@ -47,6 +47,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F else { temp = - da_i * x[ip+1] ; + if (isnan(x[ip]) || isinf(x[ip])) temp = NAN; x[ip+1] = da_i * x[ip] ; } } @@ -63,8 +64,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; } } - x[ip] = temp; - + if ( da_r != da_r ) + x[ip] = da_r; + else + x[ip] = temp; + ip += inc_x2; } diff --git a/kernel/mips64/KERNEL.LOONGSON3R4 b/kernel/mips64/KERNEL.LOONGSON3R4 index b81e5441d..1149d97f1 100644 --- a/kernel/mips64/KERNEL.LOONGSON3R4 +++ b/kernel/mips64/KERNEL.LOONGSON3R4 @@ -1,4 +1,4 @@ -ifdef HAVE_MSA +ifndef NO_MSA SAXPYKERNEL = ../mips/saxpy_msa.c DAXPYKERNEL = ../mips/daxpy_msa.c CAXPYKERNEL = ../mips/caxpy_msa.c @@ -8,14 +8,14 @@ SAXPYKERNEL = axpy_loongson3a.S DAXPYKERNEL = daxpy_loongson3a_simd.S endif -ifdef HAVE_MSA +ifndef NO_MSA SCOPYKERNEL = ../mips/scopy_msa.c DCOPYKERNEL = ../mips/dcopy_msa.c CCOPYKERNEL = ../mips/ccopy_msa.c ZCOPYKERNEL = ../mips/zcopy_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SDOTKERNEL = ../mips/sdot_msa.c DDOTKERNEL = ../mips/ddot_msa.c CDOTKERNEL = ../mips/cdot_msa.c @@ -23,21 +23,21 @@ ZDOTKERNEL = ../mips/zdot_msa.c endif DSDOTKERNEL = ../mips/dot.c -ifdef HAVE_MSA +ifndef NO_MSA SROTKERNEL = ../mips/srot_msa.c DROTKERNEL = ../mips/drot_msa.c CROTKERNEL = ../mips/crot_msa.c ZROTKERNEL = ../mips/zrot_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SSCALKERNEL = ../mips/sscal_msa.c DSCALKERNEL = ../mips/dscal_msa.c CSCALKERNEL = ../mips/cscal_msa.c ZSCALKERNEL = ../mips/zscal_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMVNKERNEL = ../mips/sgemv_n_msa.c DGEMVNKERNEL = ../mips/dgemv_n_msa.c SGEMVTKERNEL = ../mips/sgemv_t_msa.c @@ -57,21 +57,21 @@ ZGEMVNKERNEL = zgemv_n_loongson3a.c ZGEMVTKERNEL = zgemv_t_loongson3a.c endif -ifdef HAVE_MSA +ifndef NO_MSA SASUMKERNEL = ../mips/sasum_msa.c DASUMKERNEL = ../mips/dasum_msa.c CASUMKERNEL = ../mips/casum_msa.c ZASUMKERNEL = ../mips/zasum_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SSWAPKERNEL = ../mips/sswap_msa.c DSWAPKERNEL = ../mips/dswap_msa.c CSWAPKERNEL = ../mips/cswap_msa.c ZSWAPKERNEL = ../mips/zswap_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c @@ -89,7 +89,7 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -ifdef HAVE_MSA +ifndef NO_MSA DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c @@ -107,7 +107,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -ifdef HAVE_MSA +ifndef NO_MSA CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c @@ -129,7 +129,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -ifdef HAVE_MSA +ifndef NO_MSA ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c @@ -143,7 +143,7 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -ifdef HAVE_MSA +ifndef NO_MSA STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c @@ -155,7 +155,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c @@ -167,7 +167,7 @@ DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c @@ -179,7 +179,7 @@ CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/kernel/mips64/cnrm2.S b/kernel/mips64/cnrm2.S index 76fa9c295..159f9bea9 100644 --- a/kernel/mips64/cnrm2.S +++ b/kernel/mips64/cnrm2.S @@ -77,7 +77,7 @@ blez N, .L999 mov.d s2, s1 - blez INCX, .L999 + beqz INCX, .L999 dsll INCX, INCX, ZBASE_SHIFT dsra I, N, 2 diff --git a/kernel/mips64/dnrm2.S b/kernel/mips64/dnrm2.S index cd40414a2..1b55d9fc3 100644 --- a/kernel/mips64/dnrm2.S +++ b/kernel/mips64/dnrm2.S @@ -81,7 +81,7 @@ blez N, .L999 MTC $0, s1 - blez INCX, .L999 + beqz INCX, .L999 dsll INCX, INCX, BASE_SHIFT move XX, X diff --git a/kernel/mips64/snrm2.S b/kernel/mips64/snrm2.S index 1ba061a7d..f18151b5c 100644 --- a/kernel/mips64/snrm2.S +++ b/kernel/mips64/snrm2.S @@ -77,7 +77,7 @@ blez N, .L999 mov.d s2, s1 - blez INCX, .L999 + beqz INCX, .L999 dsll INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 diff --git a/kernel/mips64/znrm2.S b/kernel/mips64/znrm2.S index 1c247bca9..d33284a47 100644 --- a/kernel/mips64/znrm2.S +++ b/kernel/mips64/znrm2.S @@ -80,7 +80,7 @@ blez N, .L999 MTC $0, s1 - blez INCX, .L999 + beqz INCX, .L999 dsll INCX, INCX, ZBASE_SHIFT move XX, X diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 79d889fe0..a1cdc8639 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -1,6 +1,3 @@ -ifeq ($(HAVE_GAS), 1) -include $(KERNELDIR)/KERNEL.POWER8 -else #SGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c @@ -19,11 +16,11 @@ SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) STRMMKERNEL = sgemm_kernel_power10.c DTRMMKERNEL = dgemm_kernel_power10.c -CTRMMKERNEL = cgemm_kernel_power10.S -ZTRMMKERNEL = zgemm_kernel_power10.S +CTRMMKERNEL = cgemm_kernel_power10.c +ZTRMMKERNEL = zgemm_kernel_power10.c SGEMMKERNEL = sgemm_kernel_power10.c -SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMINCOPY = sgemm_ncopy_16_power.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S @@ -62,10 +59,14 @@ DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c -CGEMMKERNEL = cgemm_kernel_power10.S +CGEMMKERNEL = cgemm_kernel_power10.c #CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +ifeq ($(OSNAME), AIX) +CGEMMITCOPY = cgemm_tcopy_8_power8.S +else CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +endif CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) @@ -73,7 +74,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_power10.S +ZGEMMKERNEL = zgemm_kernel_power10.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c @@ -124,6 +125,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SMINKERNEL = ../arm/min.c #DMINKERNEL = ../arm/min.c # +ifeq ($(C_COMPILER), GCC) ifneq ($(GCCVERSIONGTEQ9),1) ISAMAXKERNEL = isamax_power9.S else @@ -148,6 +150,15 @@ ICAMINKERNEL = icamin_power9.S else ICAMINKERNEL = icamin.c endif +else +ISAMAXKERNEL = isamax.c +IDAMAXKERNEL = idamax.c +ICAMAXKERNEL = icamax.c +IZAMAXKERNEL = izamax.c +ISAMINKERNEL = isamin.c +IDAMINKERNEL = idamin.c +ICAMINKERNEL = icamin.c +endif IZAMINKERNEL = izamin.c # #ISMAXKERNEL = ../arm/imax.c @@ -238,4 +249,3 @@ QCABS_KERNEL = ../generic/cabs.c #Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c -endif diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 2b8e65948..700a68e44 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -50,7 +50,7 @@ CTRMMKERNEL = ctrmm_kernel_8x4_power8.S ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S SGEMMKERNEL = sgemm_kernel_16x8_power8.S -SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMINCOPY = sgemm_ncopy_16_power.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index b6b102b3e..7d007d1a2 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -13,7 +13,7 @@ CTRMMKERNEL = cgemm_kernel_power9.S ZTRMMKERNEL = zgemm_kernel_power9.S SGEMMKERNEL = sgemm_kernel_power9.S -SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMINCOPY = sgemm_ncopy_16_power.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S diff --git a/kernel/power/KERNEL.PPC970 b/kernel/power/KERNEL.PPC970 index a99fb7d96..fee5fa529 100644 --- a/kernel/power/KERNEL.PPC970 +++ b/kernel/power/KERNEL.PPC970 @@ -1,11 +1,11 @@ ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) SGEMMKERNEL = gemm_kernel.S -SGEMMINCOPY = -SGEMMITCOPY = +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) else diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4 index 1bdd3119e..c73601cee 100644 --- a/kernel/power/KERNEL.PPCG4 +++ b/kernel/power/KERNEL.PPCG4 @@ -96,9 +96,9 @@ CGEMMINCOPY = CGEMMONCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMINCOPYOBJ = +CGEMMINCOPYOBJ = #cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = +CGEMMITCOPYOBJ = #cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/power/cgemm_kernel_power10.c b/kernel/power/cgemm_kernel_power10.c new file mode 100644 index 000000000..233768cef --- /dev/null +++ b/kernel/power/cgemm_kernel_power10.c @@ -0,0 +1,1115 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ +#include "common.h" +#include + +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); + +#define SET_ACC_ZERO() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#if (defined(NN) || defined(NT) || defined(TN) || defined(TT)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = _arbi + _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += _arbi + _aibr; } +#endif + +#if (defined(NR) || defined(NC) || defined(TR) || defined(TC)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = -_arbi + _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += -_arbi + _aibr; } +#endif + +#if (defined(RN) || defined(RT) || defined(CN) || defined(CT)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = _arbi - _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += _arbi - _aibr; } +#endif + +#if (defined(RR) || defined(RC) || defined(CR) || defined(CC)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = -_arbi - _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += -_arbi - _aibr; } +#endif + +#if defined (TRMMKERNEL) +#define A_OP = +#else +#define A_OP += +#endif + +#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)&result[ 4], &acc1); \ + __builtin_mma_disassemble_acc ((void *)&result[ 8], &acc2); \ + __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \ + __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ + __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ + __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ + __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); + +#define COMP_MUL_1 \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) + +#define COMP_MAC_1(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ +} + +#define COMP_MUL_2A \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6]) + +#define COMP_MAC_2A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 2], _ro[ 7], ti[1], _ro[ 3], _ro[ 6]) \ +} + +#define COMP_MUL_2B \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 8], res[13], ti[1], res[ 9], res[12]) + +#define COMP_MAC_2B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ +} + +#define COMP_MUL_4A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MUL(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20]) \ + COMP_MUL(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28]) \ +} + +#define COMP_MAC_4A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MAC(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20]) \ + COMP_MAC(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28]) \ +} + +#define COMP_MUL_4B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MUL(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6]) \ + COMP_MUL(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14]) \ +} + +#define COMP_MAC_4B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MAC(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6]) \ + COMP_MAC(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14]) \ +} + + +#define SAVE_ACC_COMPLEX_11 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_1 \ + COMP_MAC_1(16) \ + COMP_MAC_1(32) \ + COMP_MAC_1(48) \ + COMP_MAC_1(64) \ + COMP_MAC_1(80) \ + COMP_MAC_1(96) \ + COMP_MAC_1(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; + +#define SAVE_ACC_COMPLEX_12 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_2A \ + COMP_MAC_2A(16) \ + COMP_MAC_2A(32) \ + COMP_MAC_2A(48) \ + COMP_MAC_2A(64) \ + COMP_MAC_2A(80) \ + COMP_MAC_2A(96) \ + COMP_MAC_2A(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_2B \ + COMP_MAC_2B(16) \ + COMP_MAC_2B(32) \ + COMP_MAC_2B(48) \ + COMP_MAC_2B(64) \ + COMP_MAC_2B(80) \ + COMP_MAC_2B(96) \ + COMP_MAC_2B(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4A(0) \ + COMP_MAC_4A(32) \ + COMP_MAC_4A(64) \ + COMP_MAC_4A(96) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4A(0) \ + COMP_MAC_4A(64) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4A(32) \ + COMP_MAC_4A(96) \ + CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[12] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[13] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[14] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[15] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+ 2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+ 3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(16) \ + CO[ 4] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 5] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 6] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 7] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(32) \ + CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 8] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 9] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+10] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+11] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(48) \ + CO[12] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[13] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[14] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[15] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+12] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+13] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+14] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+15] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(16) \ + CO[4] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[5] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[6] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[7] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_24_ALL \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc4); \ + __builtin_mma_disassemble_acc ((void *)(&result[8]), &acc1); \ + __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc5); \ + __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc2); \ + __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc6); \ + __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc3); \ + __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7); \ + COMP_MUL(tr[ 0], res[ 0], res[ 5], ti[ 0], res[ 1], res[ 4]) \ + COMP_MUL(tr[ 1], res[ 8], res[ 13], ti[ 1], res[ 9], res[ 12]) \ + COMP_MUL(tr[ 2], res[ 2], res[ 7], ti[ 2], res[ 3], res[ 6]) \ + COMP_MUL(tr[ 3], res[ 10], res[ 15], ti[ 3], res[ 11], res[ 14]) \ + COMP_MUL(tr[ 4], res[ 16], res[ 21], ti[ 4], res[ 17], res[ 20]) \ + COMP_MUL(tr[ 5], res[ 24], res[ 29], ti[ 5], res[ 25], res[ 28]) \ + COMP_MUL(tr[ 6], res[ 18], res[ 23], ti[ 6], res[ 19], res[ 22]) \ + COMP_MUL(tr[ 7], res[ 26], res[ 31], ti[ 7], res[ 27], res[ 30]) \ + COMP_MUL(tr[ 8], res[ 32], res[ 37], ti[ 8], res[ 33], res[ 36]) \ + COMP_MUL(tr[ 9], res[ 40], res[ 45], ti[ 9], res[ 41], res[ 44]) \ + COMP_MUL(tr[10], res[ 34], res[ 39], ti[10], res[ 35], res[ 38]) \ + COMP_MUL(tr[11], res[ 42], res[ 47], ti[11], res[ 43], res[ 46]) \ + COMP_MUL(tr[12], res[ 48], res[ 53], ti[12], res[ 49], res[ 52]) \ + COMP_MUL(tr[13], res[ 56], res[ 61], ti[13], res[ 57], res[ 60]) \ + COMP_MUL(tr[14], res[ 50], res[ 55], ti[14], res[ 51], res[ 54]) \ + COMP_MUL(tr[15], res[ 58], res[ 63], ti[15], res[ 59], res[ 62]) \ + COMP_MUL(tr[16], res[ 64], res[ 69], ti[16], res[ 65], res[ 68]) \ + COMP_MUL(tr[17], res[ 72], res[ 77], ti[17], res[ 73], res[ 76]) \ + COMP_MUL(tr[18], res[ 66], res[ 71], ti[18], res[ 67], res[ 70]) \ + COMP_MUL(tr[19], res[ 74], res[ 79], ti[19], res[ 75], res[ 78]) \ + COMP_MUL(tr[20], res[ 80], res[ 85], ti[20], res[ 81], res[ 84]) \ + COMP_MUL(tr[21], res[ 88], res[ 93], ti[21], res[ 89], res[ 92]) \ + COMP_MUL(tr[22], res[ 82], res[ 87], ti[22], res[ 83], res[ 86]) \ + COMP_MUL(tr[23], res[ 90], res[ 95], ti[23], res[ 91], res[ 94]) \ + COMP_MUL(tr[24], res[ 96], res[101], ti[24], res[ 97], res[100]) \ + COMP_MUL(tr[25], res[104], res[109], ti[25], res[105], res[108]) \ + COMP_MUL(tr[26], res[ 98], res[103], ti[26], res[ 99], res[102]) \ + COMP_MUL(tr[27], res[106], res[111], ti[27], res[107], res[110]) \ + COMP_MUL(tr[28], res[112], res[117], ti[28], res[113], res[116]) \ + COMP_MUL(tr[29], res[120], res[125], ti[29], res[121], res[124]) \ + COMP_MUL(tr[30], res[114], res[119], ti[30], res[115], res[118]) \ + COMP_MUL(tr[31], res[122], res[127], ti[31], res[123], res[126]) \ + CO[ 0] A_OP tr[ 0] * alpha_r - ti[ 0] * alpha_i; \ + CO[ 1] A_OP ti[ 0] * alpha_r + tr[ 0] * alpha_i; \ + CO[ 2] A_OP tr[ 1] * alpha_r - ti[ 1] * alpha_i; \ + CO[ 3] A_OP ti[ 1] * alpha_r + tr[ 1] * alpha_i; \ + CO[2*ldc+ 0] A_OP tr[ 2] * alpha_r - ti[ 2] * alpha_i; \ + CO[2*ldc+ 1] A_OP ti[ 2] * alpha_r + tr[ 2] * alpha_i; \ + CO[2*ldc+ 2] A_OP tr[ 3] * alpha_r - ti[ 3] * alpha_i; \ + CO[2*ldc+ 3] A_OP ti[ 3] * alpha_r + tr[ 3] * alpha_i; \ + CO[4*ldc+ 0] A_OP tr[ 4] * alpha_r - ti[ 4] * alpha_i; \ + CO[4*ldc+ 1] A_OP ti[ 4] * alpha_r + tr[ 4] * alpha_i; \ + CO[4*ldc+ 2] A_OP tr[ 5] * alpha_r - ti[ 5] * alpha_i; \ + CO[4*ldc+ 3] A_OP ti[ 5] * alpha_r + tr[ 5] * alpha_i; \ + CO[6*ldc+ 0] A_OP tr[ 6] * alpha_r - ti[ 6] * alpha_i; \ + CO[6*ldc+ 1] A_OP ti[ 6] * alpha_r + tr[ 6] * alpha_i; \ + CO[6*ldc+ 2] A_OP tr[ 7] * alpha_r - ti[ 7] * alpha_i; \ + CO[6*ldc+ 3] A_OP ti[ 7] * alpha_r + tr[ 7] * alpha_i; \ + CO[ 4] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i; \ + CO[ 5] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i; \ + CO[ 6] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i; \ + CO[ 7] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i; \ + CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i; \ + CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i; \ + CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i; \ + CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i; \ + CO[4*ldc+ 4] A_OP tr[12] * alpha_r - ti[12] * alpha_i; \ + CO[4*ldc+ 5] A_OP ti[12] * alpha_r + tr[12] * alpha_i; \ + CO[4*ldc+ 6] A_OP tr[13] * alpha_r - ti[13] * alpha_i; \ + CO[4*ldc+ 7] A_OP ti[13] * alpha_r + tr[13] * alpha_i; \ + CO[6*ldc+ 4] A_OP tr[14] * alpha_r - ti[14] * alpha_i; \ + CO[6*ldc+ 5] A_OP ti[14] * alpha_r + tr[14] * alpha_i; \ + CO[6*ldc+ 6] A_OP tr[15] * alpha_r - ti[15] * alpha_i; \ + CO[6*ldc+ 7] A_OP ti[15] * alpha_r + tr[15] * alpha_i; \ + CO[ 8] A_OP tr[16] * alpha_r - ti[16] * alpha_i; \ + CO[ 9] A_OP ti[16] * alpha_r + tr[16] * alpha_i; \ + CO[ 10] A_OP tr[17] * alpha_r - ti[17] * alpha_i; \ + CO[ 11] A_OP ti[17] * alpha_r + tr[17] * alpha_i; \ + CO[2*ldc+ 8] A_OP tr[18] * alpha_r - ti[18] * alpha_i; \ + CO[2*ldc+ 9] A_OP ti[18] * alpha_r + tr[18] * alpha_i; \ + CO[2*ldc+10] A_OP tr[19] * alpha_r - ti[19] * alpha_i; \ + CO[2*ldc+11] A_OP ti[19] * alpha_r + tr[19] * alpha_i; \ + CO[4*ldc+ 8] A_OP tr[20] * alpha_r - ti[20] * alpha_i; \ + CO[4*ldc+ 9] A_OP ti[20] * alpha_r + tr[20] * alpha_i; \ + CO[4*ldc+10] A_OP tr[21] * alpha_r - ti[21] * alpha_i; \ + CO[4*ldc+11] A_OP ti[21] * alpha_r + tr[21] * alpha_i; \ + CO[6*ldc+ 8] A_OP tr[22] * alpha_r - ti[22] * alpha_i; \ + CO[6*ldc+ 9] A_OP ti[22] * alpha_r + tr[22] * alpha_i; \ + CO[6*ldc+10] A_OP tr[23] * alpha_r - ti[23] * alpha_i; \ + CO[6*ldc+11] A_OP ti[23] * alpha_r + tr[23] * alpha_i; \ + CO[ 12] A_OP tr[24] * alpha_r - ti[24] * alpha_i; \ + CO[ 13] A_OP ti[24] * alpha_r + tr[24] * alpha_i; \ + CO[ 14] A_OP tr[25] * alpha_r - ti[25] * alpha_i; \ + CO[ 15] A_OP ti[25] * alpha_r + tr[25] * alpha_i; \ + CO[2*ldc+12] A_OP tr[26] * alpha_r - ti[26] * alpha_i; \ + CO[2*ldc+13] A_OP ti[26] * alpha_r + tr[26] * alpha_i; \ + CO[2*ldc+14] A_OP tr[27] * alpha_r - ti[27] * alpha_i; \ + CO[2*ldc+15] A_OP ti[27] * alpha_r + tr[27] * alpha_i; \ + CO[4*ldc+12] A_OP tr[28] * alpha_r - ti[28] * alpha_i; \ + CO[4*ldc+13] A_OP ti[28] * alpha_r + tr[28] * alpha_i; \ + CO[4*ldc+14] A_OP tr[29] * alpha_r - ti[29] * alpha_i; \ + CO[4*ldc+15] A_OP ti[29] * alpha_r + tr[29] * alpha_i; \ + CO[6*ldc+12] A_OP tr[30] * alpha_r - ti[30] * alpha_i; \ + CO[6*ldc+13] A_OP ti[30] * alpha_r + tr[30] * alpha_i; \ + CO[6*ldc+14] A_OP tr[31] * alpha_r - ti[31] * alpha_i; \ + CO[6*ldc+15] A_OP ti[31] * alpha_r + tr[31] * alpha_i; + +#define SAVE_ACC_COMPLEX_24(ACC1, ACC2, CI) \ + __builtin_mma_disassemble_acc ((void *)result, ACC1); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ + COMP_MUL(tr[0], res[0], res[5], ti[0], res[1], res[4]) \ + COMP_MUL(tr[1], res[8], res[13], ti[1], res[9], res[12]) \ + COMP_MUL(tr[2], res[2], res[7], ti[2], res[3], res[6]) \ + COMP_MUL(tr[3], res[10], res[15], ti[3], res[11], res[14]) \ + COMP_MUL(tr[4], res[16], res[21], ti[4], res[17], res[20]) \ + COMP_MUL(tr[5], res[24], res[29], ti[5], res[25], res[28]) \ + COMP_MUL(tr[6], res[18], res[23], ti[6], res[19], res[22]) \ + COMP_MUL(tr[7], res[26], res[31], ti[7], res[27], res[30]) \ + CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[CI+2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[CI+2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[CI+2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[CI+2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + CO[CI+4*ldc+0] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ + CO[CI+4*ldc+1] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ + CO[CI+4*ldc+2] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ + CO[CI+4*ldc+3] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ + CO[CI+6*ldc+0] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ + CO[CI+6*ldc+1] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ + CO[CI+6*ldc+2] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ + CO[CI+6*ldc+3] A_OP ti[7] * alpha_r + tr[7] * alpha_i; + +#define SAVE_ACC_COMPLEX_14 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6]) \ + COMP_MUL(tr[2], res[ 16], res[ 21], ti[2], res[ 17], res[ 20]) \ + COMP_MUL(tr[3], res[ 18], res[ 23], ti[3], res[ 19], res[ 22]) \ + COMP_MAC(tr[0], res[ 32], res[ 37], ti[0], res[ 33], res[ 36]) \ + COMP_MAC(tr[1], res[ 34], res[ 39], ti[1], res[ 35], res[ 38]) \ + COMP_MAC(tr[2], res[ 48], res[ 53], ti[2], res[ 49], res[ 52]) \ + COMP_MAC(tr[3], res[ 50], res[ 55], ti[3], res[ 51], res[ 54]) \ + COMP_MAC(tr[0], res[ 64], res[ 69], ti[0], res[ 65], res[ 68]) \ + COMP_MAC(tr[1], res[ 66], res[ 71], ti[1], res[ 67], res[ 70]) \ + COMP_MAC(tr[2], res[ 80], res[ 85], ti[2], res[ 81], res[ 84]) \ + COMP_MAC(tr[3], res[ 82], res[ 87], ti[3], res[ 83], res[ 86]) \ + COMP_MAC(tr[0], res[ 96], res[101], ti[0], res[ 97], res[100]) \ + COMP_MAC(tr[1], res[ 98], res[103], ti[1], res[ 99], res[102]) \ + COMP_MAC(tr[2], res[112], res[117], ti[2], res[113], res[116]) \ + COMP_MAC(tr[3], res[114], res[119], ti[3], res[115], res[118]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[4*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6*ldc+0] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[6*ldc+1] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * (2*x); \ + BO = B + off * (2*y); \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * (2*x); \ + BO += temp * (2*y); +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +#ifdef TRMMKERNEL +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset) +#else +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG i1, i, l, temp; + FLOAT *AO, *BO, *CO; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + v4sf_t result[32]; + FLOAT *res, tr[64], ti[64]; + res = (FLOAT *) result; + + for (i1 = 0; i1 < (n >> 2); i1++) { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc << 3; + + for (i = 0; i < (m >> 3); i++) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 4); +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc4, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc5, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc6, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB2); + } + SAVE_ACC_COMPLEX_24_ALL + CO += 16; + AO += temp << 4; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 4) +#endif + } + if (m & 4) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 4); +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB3); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB4); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + } + for (l = (temp & (~1)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2); + } + SAVE_ACC_COMPLEX_24(&acc0, &acc2, 0) + SAVE_ACC_COMPLEX_24(&acc1, &acc3, 4) + CO += 8; + AO += temp << 3; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 4) +#endif + } + if (m & 2) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 4); +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf32gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_24(&acc0, &acc1, 0) + CO += 4; + AO += temp << 2; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 4) +#endif + } + if (m & 1) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 4) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA2, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA3, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA3, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA4, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_14 + CO += 2; + AO += temp << 1; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 4) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + B += k << 3; + } + + if (n & 2) { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc << 2; + + for (i = 0; i < (m >> 3); i++) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB2); + __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA7, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_22_4 + AO += temp << 4; + BO += temp << 2; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + if (m & 4) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB3); + __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB4); + __builtin_mma_xvf32gerpp(&acc1, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_22_2 + AO += temp << 3; + BO += temp << 2; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + if (m & 2) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc0, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc0, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_22_1 + AO += temp << 2; + BO += temp << 2; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + if (m & 1) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8]; + vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10]; + vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12]; + vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_12 + AO += temp<<1; + BO += temp<<2; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + B += k << 2; + } + + if (n & 1) { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc << 1; + + for (i = 0; i < (m >> 3); i++) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB2); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB2); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB2); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_21_4 + AO += temp << 4; + BO += temp << 1; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + if (m & 4) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB2); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB3); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB3); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB4); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_21_2 + AO += temp << 3; + BO += temp << 1; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } + if (m & 2) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_21_1 + AO += temp << 2; + BO += temp << 1; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + if (m & 1) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8]; + vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10]; + vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12]; + vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_11 + AO += temp<<1; + BO += temp<<1; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k << 1; + } + return 0; +} diff --git a/kernel/power/cnrm2.S b/kernel/power/cnrm2.S index c115650fd..74117a831 100644 --- a/kernel/power/cnrm2.S +++ b/kernel/power/cnrm2.S @@ -99,7 +99,7 @@ cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 - ble- LL(9999) + beq- LL(9999) fmr f0, f1 fmr f2, f1 diff --git a/kernel/power/cnrm2_hummer.S b/kernel/power/cnrm2_hummer.S index 46c29c654..0d036b32f 100644 --- a/kernel/power/cnrm2_hummer.S +++ b/kernel/power/cnrm2_hummer.S @@ -119,7 +119,7 @@ cmpwi cr0, N, 0 ble LL(99) cmpwi cr0, INCX, 0 - ble LL(99) + beq LL(99) andi. r0, X, 2 * SIZE - 1 bne LL(100) diff --git a/kernel/power/cnrm2_ppc440.S b/kernel/power/cnrm2_ppc440.S index c71c34b7c..8e3abf9f9 100644 --- a/kernel/power/cnrm2_ppc440.S +++ b/kernel/power/cnrm2_ppc440.S @@ -104,7 +104,7 @@ cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 - ble- LL(999) + beq- LL(999) fmr f0, f1 sub X, X, INCX diff --git a/kernel/power/dgemm_small_kernel_nn_power10.c b/kernel/power/dgemm_small_kernel_nn_power10.c index ecdc3e5c6..73f6d5b99 100644 --- a/kernel/power/dgemm_small_kernel_nn_power10.c +++ b/kernel/power/dgemm_small_kernel_nn_power10.c @@ -167,7 +167,7 @@ typedef __vector unsigned char vec_t; #define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); -#if (defined(__GNUC__) && (__GNUC__ == 10)) +#if (defined(__GNUC__) && (__GNUC__ == 10 || (__GNUC__ == 11 && __GNUC_MINOR__ <= 2))) #if defined(_AIX) #define LOAD_PAIR(pair, v0, v1) \ __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1); diff --git a/kernel/power/dgemm_small_kernel_tn_power10.c b/kernel/power/dgemm_small_kernel_tn_power10.c index 93a942b02..426948185 100644 --- a/kernel/power/dgemm_small_kernel_tn_power10.c +++ b/kernel/power/dgemm_small_kernel_tn_power10.c @@ -167,7 +167,7 @@ typedef __vector unsigned char vec_t; #define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); -#if (defined(__GNUC__) && (__GNUC__ == 10)) +#if (defined(__GNUC__) && (__GNUC__ == 10 || (__GNUC__ == 11 && __GNUC_MINOR__ <= 2))) #if defined(_AIX) #define LOAD_PAIR(pair, v0, v1) \ __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1); diff --git a/kernel/power/dgemm_tcopy_16_power8.S b/kernel/power/dgemm_tcopy_16_power8.S index fc52e0202..7a3c04e8e 100644 --- a/kernel/power/dgemm_tcopy_16_power8.S +++ b/kernel/power/dgemm_tcopy_16_power8.S @@ -107,6 +107,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define o0 0 +#ifdef POWER10 +#include "dgemm_tcopy_macros_16_power10.S" +#endif #include "dgemm_tcopy_macros_16_power8.S" #define STACKSIZE 144 diff --git a/kernel/power/dgemm_tcopy_macros_16_power10.S b/kernel/power/dgemm_tcopy_macros_16_power10.S new file mode 100644 index 000000000..3cb416e09 --- /dev/null +++ b/kernel/power/dgemm_tcopy_macros_16_power10.S @@ -0,0 +1,655 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_4x16', ` +#else +.macro COPY_4x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + lxvpx vs40, o0, A1 + lxvpx vs42, o32, A1 + addi A1, A1, 64 + + lxvpx vs48, o0, A2 + lxvpx vs50, o32, A2 + addi A2, A2, 64 + + lxvpx vs4, o0, A3 + lxvpx vs6, o32, A3 + addi A3, A3, 64 + + lxvpx vs36, o0, A0 + lxvpx vs38, o32, A0 + addi A0, A0, 64 + + lxvpx vs44, o0, A1 + lxvpx vs46, o32, A1 + addi A1, A1, 64 + + lxvpx vs12, o0, A2 + lxvpx vs2, o32, A2 + addi A2, A2, 64 + + lxvpx vs8, o0, A3 + lxvpx vs10, o32, A3 + addi A3, A3, 64 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 +#else + stxvd2x vs41, o0, T1 + stxvd2x vs40, o16, T1 + stxvd2x vs43, o32, T1 + stxvd2x vs42, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 +#else + stxvd2x vs45, o0, T1 + stxvd2x vs44, o16, T1 + stxvd2x vs47, o32, T1 + stxvd2x vs46, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs48, o0, T1 + stxvd2x vs49, o16, T1 + stxvd2x vs50, o32, T1 + stxvd2x vs51, o48, T1 +#else + stxvd2x vs49, o0, T1 + stxvd2x vs48, o16, T1 + stxvd2x vs51, o32, T1 + stxvd2x vs50, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs12, o0, T1 + stxvd2x vs13, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 +#else + stxvd2x vs13, o0, T1 + stxvd2x vs12, o16, T1 + stxvd2x vs3, o32, T1 + stxvd2x vs2, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs4, o0, T1 + stxvd2x vs5, o16, T1 + stxvd2x vs6, o32, T1 + stxvd2x vs7, o48, T1 +#else + stxvd2x vs5, o0, T1 + stxvd2x vs4, o16, T1 + stxvd2x vs7, o32, T1 + stxvd2x vs6, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 +#else + stxvd2x vs9, o0, T1 + stxvd2x vs8, o16, T1 + stxvd2x vs11, o32, T1 + stxvd2x vs10, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_4x8', ` +#else +.macro COPY_4x8 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + + lxvpx vs36, o0, A1 + lxvpx vs38, o32, A1 + addi A1, A1, 64 + + + lxvpx vs40, o0, A2 + lxvpx vs42, o32, A2 + addi A2, A2, 64 + + + lxvpx vs44, o0, A3 + lxvpx vs46, o32, A3 + addi A3, A3, 64 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 +#else + stxvd2x vs41, o0, T1 + stxvd2x vs40, o16, T1 + stxvd2x vs43, o32, T1 + stxvd2x vs42, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 +#else + stxvd2x vs45, o0, T1 + stxvd2x vs44, o16, T1 + stxvd2x vs47, o32, T1 + stxvd2x vs46, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_4x4', ` +#else +.macro COPY_4x4 +#endif + + lxvpx vs32, o0, A0 + addi A0, A0, 32 + + + lxvpx vs34, o0, A1 + addi A1, A1, 32 + + + lxvpx vs36, o0, A2 + addi A2, A2, 32 + + + lxvpx vs38, o0, A3 + addi A3, A3, 32 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=2 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_2x16', ` +#else +.macro COPY_2x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + lxvpx vs36, o0, A0 + lxvpx vs38, o32, A0 + addi A0, A0, 64 + + + lxvpx vs40, o0, A1 + lxvpx vs42, o32, A1 + addi A1, A1, 64 + + lxvpx vs44, o0, A1 + lxvpx vs46, o32, A1 + addi A1, A1, 64 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 +#else + stxvd2x vs41, o0, T1 + stxvd2x vs40, o16, T1 + stxvd2x vs43, o32, T1 + stxvd2x vs42, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 +#else + stxvd2x vs45, o0, T1 + stxvd2x vs44, o16, T1 + stxvd2x vs47, o32, T1 + stxvd2x vs46, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_2x8', ` +#else +.macro COPY_2x8 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + + lxvpx vs36, o0, A1 + lxvpx vs38, o0, A1 + addi A1, A1, 64 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_2x4', ` +#else +.macro COPY_2x4 +#endif + + lxvpx vs32, o0, A0 + addi A0, A0, 32 + + + lxvpx vs34, o0, A1 + addi A1, A1, 32 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_1x16', ` +#else +.macro COPY_1x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + lxvpx vs36, o0, A0 + lxvpx vs38, o0, A0 + addi A0, A0, 64 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_1x8', ` +#else +.macro COPY_1x8 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_1x4', ` +#else +.macro COPY_1x4 +#endif + + lxvpx vs32, o0, A0 + addi A0, A0, 32 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + diff --git a/kernel/power/dgemm_tcopy_macros_16_power8.S b/kernel/power/dgemm_tcopy_macros_16_power8.S index 6c5b8ed62..93b488fd0 100644 --- a/kernel/power/dgemm_tcopy_macros_16_power8.S +++ b/kernel/power/dgemm_tcopy_macros_16_power8.S @@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_4x16', ` #else @@ -275,6 +276,7 @@ define(`COPY_4x4', ` #else .endm #endif +#endif /********************************************************************************************** @@ -369,6 +371,7 @@ define(`COPY_4x1', ` * Macros for N=2 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_2x16', ` #else @@ -512,6 +515,7 @@ define(`COPY_2x4', ` #else .endm #endif +#endif /********************************************************************************************** @@ -580,6 +584,7 @@ define(`COPY_2x1', ` * Macros for N=1 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_1x16', ` #else @@ -675,6 +680,7 @@ define(`COPY_1x4', ` #else .endm #endif +#endif /********************************************************************************************** diff --git a/kernel/power/dnrm2_hummer.S b/kernel/power/dnrm2_hummer.S index 4931f5ab1..8638ca424 100644 --- a/kernel/power/dnrm2_hummer.S +++ b/kernel/power/dnrm2_hummer.S @@ -134,7 +134,7 @@ cmpwi cr0, N, 0 ble LL(99) cmpwi cr0, INCX, 0 - ble LL(99) + beq LL(99) mr XX, X diff --git a/kernel/power/dnrm2_ppc440.S b/kernel/power/dnrm2_ppc440.S index 849ca1f35..529f6adf0 100644 --- a/kernel/power/dnrm2_ppc440.S +++ b/kernel/power/dnrm2_ppc440.S @@ -111,7 +111,7 @@ cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 - ble- LL(999) + beq- LL(999) mr NN, N mr XX, X diff --git a/kernel/power/lock.c b/kernel/power/lock.c index 1c1b006b0..de28680d8 100644 --- a/kernel/power/lock.c +++ b/kernel/power/lock.c @@ -36,7 +36,7 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -static void __inline blas_lock(volatile BLASULONG *address){ +static __inline void blas_lock(volatile BLASULONG *address){ #ifdef __GNUC__ diff --git a/kernel/power/nrm2.S b/kernel/power/nrm2.S index d9e1f4e9a..880b5d1b4 100644 --- a/kernel/power/nrm2.S +++ b/kernel/power/nrm2.S @@ -113,7 +113,7 @@ cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 - ble- LL(9999) + beq- LL(9999) mr NN, N mr XX, X diff --git a/kernel/power/sbgemm_kernel_power10.c b/kernel/power/sbgemm_kernel_power10.c index 134929ec1..c3fa67cf6 100644 --- a/kernel/power/sbgemm_kernel_power10.c +++ b/kernel/power/sbgemm_kernel_power10.c @@ -336,7 +336,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (m & 1) { IFLOAT *BO = B; - v2sf_t *rowC; v4sf_t result[4], res[4]; __vector_quad acc0, acc1; __builtin_mma_xxsetaccz (&acc0); @@ -492,7 +491,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 3; vec_t *rowA = (vec_t *) & (AO[l << 1]); - vec_t *rowB = (vec_t *) & (BO[l]); vector short rowB_mrg = { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero)); @@ -570,7 +568,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (m & 1) { IFLOAT *BO = B; - v2sf_t *rowC; v4sf_t result[4], res[4]; __vector_quad acc0; BLASLONG l = 0; diff --git a/kernel/power/sgemm_ncopy_16_power.c b/kernel/power/sgemm_ncopy_16_power.c new file mode 100755 index 000000000..babe1376e --- /dev/null +++ b/kernel/power/sgemm_ncopy_16_power.c @@ -0,0 +1,482 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; + IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; + + IFLOAT *boffset; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp19 ; + IFLOAT ctemp21, ctemp23 ; + IFLOAT ctemp25, ctemp27 ; + IFLOAT ctemp29, ctemp31 ; + + aoffset = a; + boffset = b; + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset9 = aoffset8 + lda; + aoffset10 = aoffset9 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset += 16 * lda; + i = (m >> 2); + if (i > 0){ + vector float c1, c2, c3, c4, c5, c6, c7, c8; + vector float c9, c10, c11, c12, c13, c14, c15, c16; + vector float t1, t2, t3, t4, t5, t6, t7, t8; + vector float t9, t10, t11, t12; + do{ + c1 = vec_xl(0, aoffset1); + c2 = vec_xl(0, aoffset2); + c3 = vec_xl(0, aoffset3); + c4 = vec_xl(0, aoffset4); + c5 = vec_xl(0, aoffset5); + c6 = vec_xl(0, aoffset6); + c7 = vec_xl(0, aoffset7); + c8 = vec_xl(0, aoffset8); + c9 = vec_xl(0, aoffset9); + c10 = vec_xl(0, aoffset10); + c11 = vec_xl(0, aoffset11); + c12 = vec_xl(0, aoffset12); + c13 = vec_xl(0, aoffset13); + c14 = vec_xl(0, aoffset14); + c15 = vec_xl(0, aoffset15); + c16 = vec_xl(0, aoffset16); + + t1 = vec_mergeh(c1, c2); + t2 = vec_mergeh(c3, c4); + t3 = vec_mergeh(c5, c6); + t4 = vec_mergeh(c7, c8); + t9 = vec_mergeh(c9, c10); + t10 = vec_mergeh(c11, c12); + t11 = vec_mergeh(c13, c14); + t12 = vec_mergeh(c15, c16); + + t5 = vec_xxpermdi(t1, t2, 0b00); + t6 = vec_xxpermdi(t3, t4, 0b00); + t7 = vec_xxpermdi(t9, t10, 0b00); + t8 = vec_xxpermdi(t11, t12, 0b00); + + vec_xst(t5, 0, boffset); + vec_xst(t6, 0, boffset+4); + vec_xst(t7, 0, boffset+8); + vec_xst(t8, 0, boffset+12); + t5 = vec_xxpermdi(t1, t2, 0b11); + t6 = vec_xxpermdi(t3, t4, 0b11); + t7 = vec_xxpermdi(t9, t10, 0b11); + t8 = vec_xxpermdi(t11, t12, 0b11); + vec_xst(t5, 0, boffset+16); + vec_xst(t6, 0, boffset+20); + vec_xst(t7, 0, boffset+24); + vec_xst(t8, 0, boffset+28); + + t1 = vec_mergel(c1, c2); + t2 = vec_mergel(c3, c4); + t3 = vec_mergel(c5, c6); + t4 = vec_mergel(c7, c8); + t9 = vec_mergel(c9, c10); + t10 = vec_mergel(c11, c12); + t11 = vec_mergel(c13, c14); + t12 = vec_mergel(c15, c16); + t5 = vec_xxpermdi(t1, t2, 0b00); + t6 = vec_xxpermdi(t3, t4, 0b00); + t7 = vec_xxpermdi(t9, t10, 0b00); + t8 = vec_xxpermdi(t11, t12, 0b00); + vec_xst(t5, 0, boffset+32); + vec_xst(t6, 0, boffset+36); + vec_xst(t7, 0, boffset+40); + vec_xst(t8, 0, boffset+44); + + t5 = vec_xxpermdi(t1, t2, 0b11); + t6 = vec_xxpermdi(t3, t4, 0b11); + t7 = vec_xxpermdi(t9, t10, 0b11); + t8 = vec_xxpermdi(t11, t12, 0b11); + vec_xst(t5, 0, boffset+48); + vec_xst(t6, 0, boffset+52); + vec_xst(t7, 0, boffset+56); + vec_xst(t8, 0, boffset+60); + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + aoffset5 += 4; + aoffset6 += 4; + aoffset7 += 4; + aoffset8 += 4; + + aoffset9 += 4; + aoffset10 += 4; + aoffset11 += 4; + aoffset12 += 4; + aoffset13 += 4; + aoffset14 += 4; + aoffset15 += 4; + aoffset16 += 4; + boffset += 64; + + i --; + }while(i > 0); + } + i = (m & 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + ctemp09 = *(aoffset5 + 0); + ctemp11 = *(aoffset6 + 0); + ctemp13 = *(aoffset7 + 0); + ctemp15 = *(aoffset8 + 0); + + ctemp17 = *(aoffset9 + 0); + ctemp19 = *(aoffset10 + 0); + ctemp21 = *(aoffset11 + 0); + ctemp23 = *(aoffset12 + 0); + ctemp25 = *(aoffset13 + 0); + ctemp27 = *(aoffset14 + 0); + ctemp29 = *(aoffset15 + 0); + ctemp31 = *(aoffset16 + 0); + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + *(boffset + 8) = ctemp17; + *(boffset + 9) = ctemp19; + *(boffset + 10) = ctemp21; + *(boffset + 11) = ctemp23; + *(boffset + 12) = ctemp25; + *(boffset + 13) = ctemp27; + *(boffset + 14) = ctemp29; + *(boffset + 15) = ctemp31; + aoffset1+=1; + aoffset2+=1; + aoffset3+=1; + aoffset4+=1; + aoffset5+=1; + aoffset6+=1; + aoffset7+=1; + aoffset8+=1; + aoffset9+=1; + aoffset10+=1; + aoffset11+=1; + aoffset12+=1; + aoffset13+=1; + aoffset14+=1; + aoffset15+=1; + aoffset16+=1; + boffset += 16; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = (m >> 2); + if (i > 0){ + vector float c1, c2, c3, c4, c5, c6, c7, c8; + vector float t1, t2, t3, t4, t5, t6, t7, t8; + do{ + c1 = vec_xl(0, aoffset1); + c2 = vec_xl(0, aoffset2); + c3 = vec_xl(0, aoffset3); + c4 = vec_xl(0, aoffset4); + c5 = vec_xl(0, aoffset5); + c6 = vec_xl(0, aoffset6); + c7 = vec_xl(0, aoffset7); + c8 = vec_xl(0, aoffset8); + + t1 = vec_mergeh(c1, c2); + t2 = vec_mergeh(c3, c4); + t3 = vec_mergeh(c5, c6); + t4 = vec_mergeh(c7, c8); + + t5 = vec_xxpermdi(t1, t2, 0b00); + t6 = vec_xxpermdi(t3, t4, 0b00); + t7 = vec_xxpermdi(t1, t2, 0b11); + t8 = vec_xxpermdi(t3, t4, 0b11); + + vec_xst(t5, 0, boffset); + vec_xst(t6, 0, boffset+4); + vec_xst(t7, 0, boffset+8); + vec_xst(t8, 0, boffset+12); + + t1 = vec_mergel(c1, c2); + t2 = vec_mergel(c3, c4); + t3 = vec_mergel(c5, c6); + t4 = vec_mergel(c7, c8); + + t5 = vec_xxpermdi(t1, t2, 0b00); + t6 = vec_xxpermdi(t3, t4, 0b00); + t7 = vec_xxpermdi(t1, t2, 0b11); + t8 = vec_xxpermdi(t3, t4, 0b11); + + vec_xst(t5, 0, boffset+16); + vec_xst(t6, 0, boffset+20); + vec_xst(t7, 0, boffset+24); + vec_xst(t8, 0, boffset+28); + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + aoffset5 += 4; + aoffset6 += 4; + aoffset7 += 4; + aoffset8 += 4; + + boffset += 32; + i--; + }while(i > 0); + } + + i = (m & 3); + if (i > 0) { + do { + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + ctemp09 = *(aoffset5 + 0); + ctemp11 = *(aoffset6 + 0); + ctemp13 = *(aoffset7 + 0); + ctemp15 = *(aoffset8 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + aoffset1+=1; + aoffset2+=1; + aoffset3+=1; + aoffset4+=1; + aoffset5+=1; + aoffset6+=1; + aoffset7+=1; + aoffset8+=1; + + boffset += 8; + i--; + } while (i > 0); + } + } + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + vector float c1, c2, c3, c4; + vector float t1, t2, t3, t4; + do{ + c1 = vec_xl(0, aoffset1); + c2 = vec_xl(0, aoffset2); + c3 = vec_xl(0, aoffset3); + c4 = vec_xl(0, aoffset4); + + t1 = vec_mergeh(c1, c2); + t2 = vec_mergeh(c3, c4); + + t3 = vec_xxpermdi(t1, t2, 0b00); + t4 = vec_xxpermdi(t1, t2, 0b11); + + vec_xst(t3, 0, boffset); + vec_xst(t4, 0, boffset+4); + + t1 = vec_mergel(c1, c2); + t2 = vec_mergel(c3, c4); + + t3 = vec_xxpermdi(t1, t2, 0b00); + t4 = vec_xxpermdi(t1, t2, 0b11); + + vec_xst(t3, 0, boffset+8); + vec_xst(t4, 0, boffset+12); + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + + boffset += 16; + i--; + }while(i > 0); + } + + i = (m & 3); + if (i > 0) { + do { + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + + aoffset1+=1; + aoffset2+=1; + aoffset3+=1; + aoffset4+=1; + + boffset += 4; + i--; + } while (i > 0); + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp02; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + boffset += 2; + } + } + + if (n & 1){ + aoffset1 = aoffset; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 += 2; + boffset += 2; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + // boffset += 1; + } + } + + return 0; +} diff --git a/kernel/power/sgemm_tcopy_16_power8.S b/kernel/power/sgemm_tcopy_16_power8.S index b9f6d63fb..6d2c6a555 100644 --- a/kernel/power/sgemm_tcopy_16_power8.S +++ b/kernel/power/sgemm_tcopy_16_power8.S @@ -108,6 +108,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define o0 0 +#ifdef POWER10 +#include "sgemm_tcopy_macros_16_power10.S" +#endif #include "sgemm_tcopy_macros_16_power8.S" #define STACKSIZE 144 diff --git a/kernel/power/sgemm_tcopy_macros_16_power10.S b/kernel/power/sgemm_tcopy_macros_16_power10.S new file mode 100644 index 000000000..dca37e48a --- /dev/null +++ b/kernel/power/sgemm_tcopy_macros_16_power10.S @@ -0,0 +1,323 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_4x16', ` +#else +.macro COPY_4x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + + lxvpx vs36, o0, A1 + lxvpx vs38, o32, A1 + + lxvpx vs40, o0, A2 + lxvpx vs42, o32, A2 + + lxvpx vs44, o0, A3 + lxvpx vs46, o32, A3 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 + stxvx vs34, o32, T1 + stxvx vs35, o48, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 + stxvx vs35, o32, T1 + stxvx vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs36, o0, T1 + stxvx vs37, o16, T1 + stxvx vs38, o32, T1 + stxvx vs39, o48, T1 +#else + stxvx vs37, o0, T1 + stxvx vs36, o16, T1 + stxvx vs39, o32, T1 + stxvx vs38, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs40, o0, T1 + stxvx vs41, o16, T1 + stxvx vs42, o32, T1 + stxvx vs43, o48, T1 +#else + stxvx vs41, o0, T1 + stxvx vs40, o16, T1 + stxvx vs43, o32, T1 + stxvx vs42, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs44, o0, T1 + stxvx vs45, o16, T1 + stxvx vs46, o32, T1 + stxvx vs47, o48, T1 +#else + stxvx vs45, o0, T1 + stxvx vs44, o16, T1 + stxvx vs47, o32, T1 + stxvx vs46, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_4x8', ` +#else +.macro COPY_4x8 +#endif + + lxvpx vs32, o0, A0 + + lxvpx vs34, o0, A1 + + lxvpx vs36, o0, A2 + + lxvpx vs38, o0, A3 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 + + stxvx vs34, o32, T1 + stxvx vs35, o48, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 + + stxvx vs35, o32, T1 + stxvx vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs36, o0, T1 + stxvx vs37, o16, T1 + + stxvx vs38, o32, T1 + stxvx vs39, o48, T1 +#else + stxvx vs37, o0, T1 + stxvx vs36, o16, T1 + + stxvx vs39, o32, T1 + stxvx vs38, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + +/********************************************************************************************** +* Macros for N=2 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_2x16', ` +#else +.macro COPY_2x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + + lxvpx vs36, o0, A1 + lxvpx vs38, o32, A1 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 + stxvx vs34, o32, T1 + stxvx vs35, o48, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 + stxvx vs35, o32, T1 + stxvx vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs36, o0, T1 + stxvx vs37, o16, T1 + stxvx vs38, o32, T1 + stxvx vs39, o48, T1 +#else + stxvx vs37, o0, T1 + stxvx vs36, o16, T1 + stxvx vs39, o32, T1 + stxvx vs38, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_2x8', ` +#else +.macro COPY_2x8 +#endif + + lxvpx vs32, o0, A0 + + lxvpx vs34, o0, A1 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 + + stxvx vs34, o32, T1 + stxvx vs35, o48, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 + + stxvx vs35, o32, T1 + stxvx vs34, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_1x16', ` +#else +.macro COPY_1x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 + stxvx vs34, o32, T1 + stxvx vs35, o48, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 + stxvx vs35, o32, T1 + stxvx vs34, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_1x8', ` +#else +.macro COPY_1x8 +#endif + + lxvpx vs32, o0, A0 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + diff --git a/kernel/power/sgemm_tcopy_macros_16_power8.S b/kernel/power/sgemm_tcopy_macros_16_power8.S index ed592a604..af237d5ee 100644 --- a/kernel/power/sgemm_tcopy_macros_16_power8.S +++ b/kernel/power/sgemm_tcopy_macros_16_power8.S @@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_4x16', ` #else @@ -141,6 +142,7 @@ define(`COPY_4x8', ` #else .endm #endif +#endif /********************************************************************************************** * Macros for N=4 and M=4 @@ -264,6 +266,7 @@ define(`COPY_4x1', ` * Macros for N=2 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_2x16', ` #else @@ -329,6 +332,7 @@ define(`COPY_2x8', ` #else .endm #endif +#endif /********************************************************************************************** * Macros for N=2 and M=4 @@ -418,6 +422,7 @@ define(`COPY_2x1', ` * Macros for N=1 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_1x16', ` #else @@ -465,6 +470,7 @@ define(`COPY_1x8', ` #else .endm #endif +#endif /********************************************************************************************** * Macros for N=1 and M=4 diff --git a/kernel/power/snrm2.S b/kernel/power/snrm2.S index be974cc48..696d404bb 100644 --- a/kernel/power/snrm2.S +++ b/kernel/power/snrm2.S @@ -97,7 +97,7 @@ cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 - ble- LL(9999) + beq- LL(9999) fmr f0, f1 fmr f2, f1 diff --git a/kernel/power/snrm2_hummer.S b/kernel/power/snrm2_hummer.S index a0ff3d1b2..a4292af78 100644 --- a/kernel/power/snrm2_hummer.S +++ b/kernel/power/snrm2_hummer.S @@ -119,7 +119,7 @@ cmpwi cr0, N, 0 ble LL(99) cmpwi cr0, INCX, 0 - ble LL(99) + beq LL(99) cmpwi cr0, INCX, SIZE bne LL(100) diff --git a/kernel/power/snrm2_ppc440.S b/kernel/power/snrm2_ppc440.S index 0a80d1224..3547d7f47 100644 --- a/kernel/power/snrm2_ppc440.S +++ b/kernel/power/snrm2_ppc440.S @@ -105,7 +105,7 @@ cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 - ble- LL(999) + beq- LL(999) fmr f0, f1 fmr f2, f1 diff --git a/kernel/power/zgemm_kernel_power10.c b/kernel/power/zgemm_kernel_power10.c new file mode 100644 index 000000000..370d12af3 --- /dev/null +++ b/kernel/power/zgemm_kernel_power10.c @@ -0,0 +1,736 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ +#include "common.h" +#include + +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); + +#define SET_ACC_ZERO() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#if (defined(NN) || defined(NT) || defined(TN) || defined(TT)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = _arbi + _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += _arbi + _aibr; } +#endif + +#if (defined(NR) || defined(NC) || defined(TR) || defined(TC)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = -_arbi + _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += -_arbi + _aibr; } +#endif + +#if (defined(RN) || defined(RT) || defined(CN) || defined(CT)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = _arbi - _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += _arbi - _aibr; } +#endif + +#if (defined(RR) || defined(RC) || defined(CR) || defined(CC)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = -_arbi - _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += -_arbi - _aibr; } +#endif + +#if defined(TRMMKERNEL) +#define A_OP = +#else +#define A_OP += +#endif + +#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)&result[4], &acc1); \ + __builtin_mma_disassemble_acc ((void *)&result[8], &acc2); \ + __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \ + __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ + __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ + __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ + __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); + +#define SAVE_ACC_COMPLEX_11 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; + +#define SAVE_ACC_COMPLEX_12 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 8], res[11], ti[1], res[ 9], res[10]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[24], res[27], ti[1], res[25], res[26]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[40], res[43], ti[1], res[41], res[42]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[56], res[59], ti[1], res[57], res[58]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ + COMP_MAC(tr[1], res[12], res[15], ti[1], res[13], res[14]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ + COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ + COMP_MAC(tr[1], res[28], res[31], ti[1], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ + COMP_MAC(tr[1], res[44], res[47], ti[1], res[45], res[46]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ + COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ + COMP_MAC(tr[1], res[60], res[63], ti[1], res[61], res[62]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ + COMP_MAC(tr[2], res[24], res[27], ti[2], res[25], res[26]) \ + COMP_MAC(tr[3], res[28], res[31], ti[3], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ + COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ + COMP_MAC(tr[2], res[56], res[59], ti[2], res[57], res[58]) \ + COMP_MAC(tr[3], res[60], res[63], ti[3], res[61], res[62]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + COMP_MUL(tr[4], res[16], res[19], ti[4], res[17], res[18]) \ + COMP_MUL(tr[5], res[20], res[23], ti[5], res[21], res[22]) \ + COMP_MUL(tr[6], res[24], res[27], ti[6], res[25], res[26]) \ + COMP_MUL(tr[7], res[28], res[31], ti[7], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ + COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ + COMP_MAC(tr[4], res[48], res[51], ti[4], res[49], res[50]) \ + COMP_MAC(tr[5], res[52], res[55], ti[5], res[53], res[54]) \ + COMP_MAC(tr[6], res[56], res[59], ti[6], res[57], res[58]) \ + COMP_MAC(tr[7], res[60], res[63], ti[7], res[61], res[62]) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ + CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ + CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ + CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ + CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ + CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ + CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ + CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_1 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc1); \ + COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ + COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ + COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14] ) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI) \ + __builtin_mma_disassemble_acc ((void *)result, ACC1); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ + COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ + COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ + COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+CI+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+CI+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+CI+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+CI+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * (2*x); \ + BO = B + off * (2*y); \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * (2*x); \ + BO += temp * (2*y); +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +#ifdef TRMMKERNEL +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset) +#else +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG i1, i, l, temp; + FLOAT *AO, *BO, *CO; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + v4sf_t result[32]; + FLOAT *res, tr[16], ti[16]; + res = (FLOAT *) result; + + for (i1 = 0; i1 < (n >> 1); i1++) { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc<<2; + for (i = 0; i < (m >> 3); i++) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf64gerpp(&acc4, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc5, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc6, rowA3, rowB2); + __builtin_mma_xvf64gerpp(&acc7, rowA4, rowB2); + } + __builtin_mma_disassemble_acc ((void *)result, &acc0); + __builtin_mma_disassemble_acc ((void *)(&result[ 4]), &acc1); + __builtin_mma_disassemble_acc ((void *)(&result[ 8]), &acc2); + __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc3); + __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc4); + __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc5); + __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc6); + __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7); + COMP_MUL(tr[ 0], res[ 0], res[ 3], ti[ 0], res[ 1], res[ 2]) + COMP_MUL(tr[ 1], res[ 4], res[ 7], ti[ 1], res[ 5], res[ 6]) + COMP_MUL(tr[ 2], res[ 8], res[11], ti[ 2], res[ 9], res[10]) + COMP_MUL(tr[ 3], res[12], res[15], ti[ 3], res[13], res[14]) + COMP_MUL(tr[ 4], res[16], res[19], ti[ 4], res[17], res[18]) + COMP_MUL(tr[ 5], res[20], res[23], ti[ 5], res[21], res[22]) + COMP_MUL(tr[ 6], res[24], res[27], ti[ 6], res[25], res[26]) + COMP_MUL(tr[ 7], res[28], res[31], ti[ 7], res[29], res[30]) + COMP_MUL(tr[ 8], res[32], res[35], ti[ 8], res[33], res[34]) + COMP_MUL(tr[ 9], res[36], res[39], ti[ 9], res[37], res[38]) + COMP_MUL(tr[10], res[40], res[43], ti[10], res[41], res[42]) + COMP_MUL(tr[11], res[44], res[47], ti[11], res[45], res[46]) + COMP_MUL(tr[12], res[48], res[51], ti[12], res[49], res[50]) + COMP_MUL(tr[13], res[52], res[55], ti[13], res[53], res[54]) + COMP_MUL(tr[14], res[56], res[59], ti[14], res[57], res[58]) + COMP_MUL(tr[15], res[60], res[63], ti[15], res[61], res[62]) + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; + CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; + CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; + CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; + CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; + CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; + CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; + CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; + CO[2*ldc+ 0] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i; + CO[2*ldc+ 1] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i; + CO[2*ldc+ 2] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i; + CO[2*ldc+ 3] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i; + CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i; + CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i; + CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i; + CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i; + CO[2*ldc+ 8] A_OP tr[12] * alpha_r - ti[12] * alpha_i; + CO[2*ldc+ 9] A_OP ti[12] * alpha_r + tr[12] * alpha_i; + CO[2*ldc+10] A_OP tr[13] * alpha_r - ti[13] * alpha_i; + CO[2*ldc+11] A_OP ti[13] * alpha_r + tr[13] * alpha_i; + CO[2*ldc+12] A_OP tr[14] * alpha_r - ti[14] * alpha_i; + CO[2*ldc+13] A_OP ti[14] * alpha_r + tr[14] * alpha_i; + CO[2*ldc+14] A_OP tr[15] * alpha_r - ti[15] * alpha_i; + CO[2*ldc+15] A_OP ti[15] * alpha_r + tr[15] * alpha_i; + + AO += temp << 4; + BO += temp << 2; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + if (m & 4) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB3); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + } + for (l = (temp & (~1)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); + } + SAVE_ACC_COMPLEX_22_2(&acc0, &acc2, 0) + SAVE_ACC_COMPLEX_22_2(&acc1, &acc3, 4) + AO += temp << 3; + BO += temp << 2; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + if (m & 2) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_22_1 + AO += temp << 2; + BO += temp << 2; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + if (m & 1) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_12 + AO += temp << 1; + BO += temp << 2; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + B += k << 2; + } + if (n & 1) { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc<<1; + for (i = 0; i < (m >> 3); i++) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf64gerpp(&acc0, rowA5, rowB2); + __builtin_mma_xvf64gerpp(&acc1, rowA6, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_21_4 + + AO += temp << 4; + BO += temp << 1; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + if (m & 4) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB2); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB3); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB3); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_21_2 + AO += temp << 3; + BO += temp << 1; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } + if (m & 2) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_21_1 + AO += temp << 2; + BO += temp << 1; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + if (m & 1) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_11 + AO += temp << 1; + BO += temp << 1; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k << 1; + } + return 0; +} diff --git a/kernel/power/znrm2.S b/kernel/power/znrm2.S index 60f379d25..3048e3480 100644 --- a/kernel/power/znrm2.S +++ b/kernel/power/znrm2.S @@ -105,7 +105,7 @@ cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 - ble- LL(9999) + beq- LL(9999) mr NN, N mr XX, X diff --git a/kernel/power/znrm2_hummer.S b/kernel/power/znrm2_hummer.S index 1d0c598f8..4ef2212df 100644 --- a/kernel/power/znrm2_hummer.S +++ b/kernel/power/znrm2_hummer.S @@ -134,7 +134,7 @@ cmpwi cr0, N, 0 ble LL(99) cmpwi cr0, INCX, 0 - ble LL(99) + beq LL(99) mr XX, X diff --git a/kernel/power/znrm2_ppc440.S b/kernel/power/znrm2_ppc440.S index 778b805de..f775c3e62 100644 --- a/kernel/power/znrm2_ppc440.S +++ b/kernel/power/znrm2_ppc440.S @@ -112,7 +112,7 @@ cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 - ble- LL(999) + beq- LL(999) mr NN, N mr XX, X diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V index 0da66fa35..2798a870e 100644 --- a/kernel/riscv64/KERNEL.C910V +++ b/kernel/riscv64/KERNEL.C910V @@ -59,6 +59,7 @@ SDOTKERNEL = dot_vector.c DDOTKERNEL = dot_vector.c CDOTKERNEL = zdot_vector.c ZDOTKERNEL = zdot_vector.c +DSDOTKERNEL = dsdot_vector.c SNRM2KERNEL = nrm2_vector.c DNRM2KERNEL = nrm2_vector.c diff --git a/kernel/riscv64/KERNEL.RISCV64_GENERIC b/kernel/riscv64/KERNEL.RISCV64_GENERIC index 61a8a2b91..15bcd2289 100644 --- a/kernel/riscv64/KERNEL.RISCV64_GENERIC +++ b/kernel/riscv64/KERNEL.RISCV64_GENERIC @@ -45,6 +45,11 @@ DAXPYKERNEL = ../riscv64/axpy.c CAXPYKERNEL = ../riscv64/zaxpy.c ZAXPYKERNEL = ../riscv64/zaxpy.c +SAXPBYKERNEL = ../riscv64/axpby.c +DAXPBYKERNEL = ../riscv64/axpby.c +CAXPBYKERNEL = ../riscv64/zaxpby.c +ZAXPBYKERNEL = ../riscv64/zaxpby.c + SCOPYKERNEL = ../riscv64/copy.c DCOPYKERNEL = ../riscv64/copy.c CCOPYKERNEL = ../riscv64/zcopy.c diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL128B b/kernel/riscv64/KERNEL.RISCV64_ZVL128B new file mode 100644 index 000000000..fec69ee09 --- /dev/null +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL128B @@ -0,0 +1,243 @@ +SAMAXKERNEL = amax_rvv.c +DAMAXKERNEL = amax_rvv.c +CAMAXKERNEL = zamax_rvv.c +ZAMAXKERNEL = zamax_rvv.c + +SAMINKERNEL = amin_rvv.c +DAMINKERNEL = amin_rvv.c +CAMINKERNEL = zamin_rvv.c +ZAMINKERNEL = zamin_rvv.c + +SMAXKERNEL = max_rvv.c +DMAXKERNEL = max_rvv.c + +SMINKERNEL = min_rvv.c +DMINKERNEL = min_rvv.c + +ISAMAXKERNEL = iamax_rvv.c +IDAMAXKERNEL = iamax_rvv.c +ICAMAXKERNEL = izamax_rvv.c +IZAMAXKERNEL = izamax_rvv.c + +ISAMINKERNEL = iamin_rvv.c +IDAMINKERNEL = iamin_rvv.c +ICAMINKERNEL = izamin_rvv.c +IZAMINKERNEL = izamin_rvv.c + +ISMAXKERNEL = imax_rvv.c +IDMAXKERNEL = imax_rvv.c + +ISMINKERNEL = imin_rvv.c +IDMINKERNEL = imin_rvv.c + +SASUMKERNEL = asum_rvv.c +DASUMKERNEL = asum_rvv.c +CASUMKERNEL = zasum_rvv.c +ZASUMKERNEL = zasum_rvv.c + +SSUMKERNEL = sum_rvv.c +DSUMKERNEL = sum_rvv.c +CSUMKERNEL = zsum_rvv.c +ZSUMKERNEL = zsum_rvv.c + +SAXPYKERNEL = axpy_rvv.c +DAXPYKERNEL = axpy_rvv.c +CAXPYKERNEL = zaxpy_rvv.c +ZAXPYKERNEL = zaxpy_rvv.c + +SAXPBYKERNEL = axpby_rvv.c +DAXPBYKERNEL = axpby_rvv.c +CAXPBYKERNEL = zaxpby_rvv.c +ZAXPBYKERNEL = zaxpby_rvv.c + +SCOPYKERNEL = copy_rvv.c +DCOPYKERNEL = copy_rvv.c +CCOPYKERNEL = zcopy_rvv.c +ZCOPYKERNEL = zcopy_rvv.c + +SDOTKERNEL = dot_rvv.c +DDOTKERNEL = dot_rvv.c +CDOTKERNEL = zdot_rvv.c +ZDOTKERNEL = zdot_rvv.c +DSDOTKERNEL = dot_rvv.c + +SNRM2KERNEL = nrm2_rvv.c +DNRM2KERNEL = nrm2_rvv.c +CNRM2KERNEL = znrm2_rvv.c +ZNRM2KERNEL = znrm2_rvv.c + +SROTKERNEL = rot_rvv.c +DROTKERNEL = rot_rvv.c +CROTKERNEL = zrot_rvv.c +ZROTKERNEL = zrot_rvv.c + +SSCALKERNEL = scal_rvv.c +DSCALKERNEL = scal_rvv.c +CSCALKERNEL = zscal_rvv.c +ZSCALKERNEL = zscal_rvv.c + +SSWAPKERNEL = swap_rvv.c +DSWAPKERNEL = swap_rvv.c +CSWAPKERNEL = zswap_rvv.c +ZSWAPKERNEL = zswap_rvv.c + +SGEMVNKERNEL = gemv_n_rvv.c +DGEMVNKERNEL = gemv_n_rvv.c +CGEMVNKERNEL = zgemv_n_rvv.c +ZGEMVNKERNEL = zgemv_n_rvv.c + +SGEMVTKERNEL = gemv_t_rvv.c +DGEMVTKERNEL = gemv_t_rvv.c +CGEMVTKERNEL = zgemv_t_rvv.c +ZGEMVTKERNEL = zgemv_t_rvv.c + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c +STRMMUNCOPY_M = ../generic/trmm_uncopy_$(SGEMM_UNROLL_M).c +STRMMLNCOPY_M = ../generic/trmm_lncopy_$(SGEMM_UNROLL_M).c +STRMMUTCOPY_M = ../generic/trmm_utcopy_$(SGEMM_UNROLL_M).c +STRMMLTCOPY_M = ../generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c + +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c +DTRMMUNCOPY_M = ../generic/trmm_uncopy_$(DGEMM_UNROLL_M).c +DTRMMLNCOPY_M = ../generic/trmm_lncopy_$(DGEMM_UNROLL_M).c +DTRMMUTCOPY_M = ../generic/trmm_utcopy_$(DGEMM_UNROLL_M).c +DTRMMLTCOPY_M = ../generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c + +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c +CTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +CTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c +CTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c +CTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c + +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c +ZTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c +ZTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c +ZTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c +ZTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SSYMV_U_KERNEL = symv_U_rvv.c +SSYMV_L_KERNEL = symv_L_rvv.c +DSYMV_U_KERNEL = symv_U_rvv.c +DSYMV_L_KERNEL = symv_L_rvv.c +CSYMV_U_KERNEL = zsymv_U_rvv.c +CSYMV_L_KERNEL = zsymv_L_rvv.c +ZSYMV_U_KERNEL = zsymv_U_rvv.c +ZSYMV_L_KERNEL = zsymv_L_rvv.c + +CHEMV_L_KERNEL = zhemv_LM_rvv.c +CHEMV_M_KERNEL = zhemv_LM_rvv.c +CHEMV_U_KERNEL = zhemv_UV_rvv.c +CHEMV_V_KERNEL = zhemv_UV_rvv.c +ZHEMV_L_KERNEL = zhemv_LM_rvv.c +ZHEMV_M_KERNEL = zhemv_LM_rvv.c +ZHEMV_U_KERNEL = zhemv_UV_rvv.c +ZHEMV_V_KERNEL = zhemv_UV_rvv.c + +SSYMMUCOPY_M = ../generic/symm_ucopy_$(SGEMM_UNROLL_M).c +SSYMMLCOPY_M = ../generic/symm_lcopy_$(SGEMM_UNROLL_M).c + +DSYMMUCOPY_M = ../generic/symm_ucopy_$(DGEMM_UNROLL_M).c +DSYMMLCOPY_M = ../generic/symm_lcopy_$(DGEMM_UNROLL_M).c + +CSYMMUCOPY_M = ../generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c +CSYMMLCOPY_M = ../generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c + +ZSYMMUCOPY_M = ../generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c +ZSYMMLCOPY_M = ../generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c + +CHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c +CHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c + +ZHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c +ZHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = gemm_beta_rvv.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = gemm_beta_rvv.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = zgemm_beta_rvv.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = zgemm_beta_rvv.c +endif diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B new file mode 100644 index 000000000..d8690682f --- /dev/null +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -0,0 +1,199 @@ +SAMAXKERNEL = amax_vector.c +DAMAXKERNEL = amax_vector.c +CAMAXKERNEL = zamax_vector.c +ZAMAXKERNEL = zamax_vector.c + +SAMINKERNEL = amin_vector.c +DAMINKERNEL = amin_vector.c +CAMINKERNEL = zamin_vector.c +ZAMINKERNEL = zamin_vector.c + +SMAXKERNEL = max_vector.c +DMAXKERNEL = max_vector.c + +SMINKERNEL = min_vector.c +DMINKERNEL = min_vector.c + +ISAMAXKERNEL = iamax_vector.c +IDAMAXKERNEL = iamax_vector.c +ICAMAXKERNEL = izamax_vector.c +IZAMAXKERNEL = izamax_vector.c + +ISAMINKERNEL = iamin_vector.c +IDAMINKERNEL = iamin_vector.c +ICAMINKERNEL = izamin_vector.c +IZAMINKERNEL = izamin_vector.c + +ISMAXKERNEL = imax_vector.c +IDMAXKERNEL = imax_vector.c + +ISMINKERNEL = imin_vector.c +IDMINKERNEL = imin_vector.c + +SASUMKERNEL = asum_vector.c +DASUMKERNEL = asum_vector.c +CASUMKERNEL = zasum_vector.c +ZASUMKERNEL = zasum_vector.c + +SSUMKERNEL = sum_vector.c +DSUMKERNEL = sum_vector.c +CSUMKERNEL = zsum_vector.c +ZSUMKERNEL = zsum_vector.c + +SAXPYKERNEL = axpy_vector.c +DAXPYKERNEL = axpy_vector.c +CAXPYKERNEL = zaxpy_vector.c +ZAXPYKERNEL = zaxpy_vector.c + +SCOPYKERNEL = copy_vector.c +DCOPYKERNEL = copy_vector.c +CCOPYKERNEL = zcopy_vector.c +ZCOPYKERNEL = zcopy_vector.c + +SDOTKERNEL = dot_vector.c +DDOTKERNEL = dot_vector.c +CDOTKERNEL = zdot_vector.c +ZDOTKERNEL = zdot_vector.c +DSDOTKERNEL = ../generic/dot.c + +SNRM2KERNEL = nrm2_vector.c +DNRM2KERNEL = nrm2_vector.c +CNRM2KERNEL = znrm2_vector.c +ZNRM2KERNEL = znrm2_vector.c + +SROTKERNEL = rot_vector.c +DROTKERNEL = rot_vector.c +CROTKERNEL = zrot_vector.c +ZROTKERNEL = zrot_vector.c + +SSCALKERNEL = scal_vector.c +DSCALKERNEL = scal_vector.c +CSCALKERNEL = zscal_vector.c +ZSCALKERNEL = zscal_vector.c + +SSWAPKERNEL = swap_vector.c +DSWAPKERNEL = swap_vector.c +CSWAPKERNEL = zswap_vector.c +ZSWAPKERNEL = zswap_vector.c + +SGEMVNKERNEL = gemv_n_vector.c +DGEMVNKERNEL = gemv_n_vector.c +CGEMVNKERNEL = zgemv_n_vector.c +ZGEMVNKERNEL = zgemv_n_vector.c + +SGEMVTKERNEL = gemv_t_vector.c +DGEMVTKERNEL = gemv_t_vector.c +CGEMVTKERNEL = zgemv_t_vector.c +ZGEMVTKERNEL = zgemv_t_vector.c + +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SSYMV_U_KERNEL = symv_U_vector.c +SSYMV_L_KERNEL = symv_L_vector.c +DSYMV_U_KERNEL = symv_U_vector.c +DSYMV_L_KERNEL = symv_L_vector.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c + +CHEMV_L_KERNEL = zhemv_LM_vector.c +CHEMV_M_KERNEL = zhemv_LM_vector.c +CHEMV_U_KERNEL = zhemv_UV_vector.c +CHEMV_V_KERNEL = zhemv_UV_vector.c +ZHEMV_L_KERNEL = zhemv_LM_vector.c +ZHEMV_M_KERNEL = zhemv_LM_vector.c +ZHEMV_U_KERNEL = zhemv_UV_vector.c +ZHEMV_V_KERNEL = zhemv_UV_vector.c + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280 new file mode 100644 index 000000000..86708fe01 --- /dev/null +++ b/kernel/riscv64/KERNEL.x280 @@ -0,0 +1,281 @@ +# ********************************************************************************** +# Copyright (c) 2022, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ********************************************************************************** + +SAMAXKERNEL = amax_rvv.c +DAMAXKERNEL = amax_rvv.c +CAMAXKERNEL = zamax_rvv.c +ZAMAXKERNEL = zamax_rvv.c + +SAMINKERNEL = amin_rvv.c +DAMINKERNEL = amin_rvv.c +CAMINKERNEL = zamin_rvv.c +ZAMINKERNEL = zamin_rvv.c + +SMAXKERNEL = max_rvv.c +DMAXKERNEL = max_rvv.c + +SMINKERNEL = min_rvv.c +DMINKERNEL = min_rvv.c + +ISAMAXKERNEL = iamax_rvv.c +IDAMAXKERNEL = iamax_rvv.c +ICAMAXKERNEL = izamax_rvv.c +IZAMAXKERNEL = izamax_rvv.c + +ISAMINKERNEL = iamin_rvv.c +IDAMINKERNEL = iamin_rvv.c +ICAMINKERNEL = izamin_rvv.c +IZAMINKERNEL = izamin_rvv.c + +ISMAXKERNEL = imax_rvv.c +IDMAXKERNEL = imax_rvv.c + +ISMINKERNEL = imin_rvv.c +IDMINKERNEL = imin_rvv.c + +SASUMKERNEL = asum_rvv.c +DASUMKERNEL = asum_rvv.c +CASUMKERNEL = zasum_rvv.c +ZASUMKERNEL = zasum_rvv.c + +SSUMKERNEL = sum_rvv.c +DSUMKERNEL = sum_rvv.c +CSUMKERNEL = zsum_rvv.c +ZSUMKERNEL = zsum_rvv.c + +SAXPYKERNEL = axpy_rvv.c +DAXPYKERNEL = axpy_rvv.c +CAXPYKERNEL = zaxpy_rvv.c +ZAXPYKERNEL = zaxpy_rvv.c + +SAXPBYKERNEL = axpby_rvv.c +DAXPBYKERNEL = axpby_rvv.c +CAXPBYKERNEL = zaxpby_rvv.c +ZAXPBYKERNEL = zaxpby_rvv.c + +SCOPYKERNEL = copy_rvv.c +DCOPYKERNEL = copy_rvv.c +CCOPYKERNEL = zcopy_rvv.c +ZCOPYKERNEL = zcopy_rvv.c + +SDOTKERNEL = dot_rvv.c +DDOTKERNEL = dot_rvv.c +CDOTKERNEL = zdot_rvv.c +ZDOTKERNEL = zdot_rvv.c +DSDOTKERNEL = dot_rvv.c + +SNRM2KERNEL = nrm2_rvv.c +DNRM2KERNEL = nrm2_rvv.c +CNRM2KERNEL = znrm2_rvv.c +ZNRM2KERNEL = znrm2_rvv.c + +SROTKERNEL = rot_rvv.c +DROTKERNEL = rot_rvv.c +CROTKERNEL = zrot_rvv.c +ZROTKERNEL = zrot_rvv.c + +SSCALKERNEL = scal_rvv.c +DSCALKERNEL = scal_rvv.c +CSCALKERNEL = zscal_rvv.c +ZSCALKERNEL = zscal_rvv.c + +SSWAPKERNEL = swap_rvv.c +DSWAPKERNEL = swap_rvv.c +CSWAPKERNEL = zswap_rvv.c +ZSWAPKERNEL = zswap_rvv.c + +SGEMVNKERNEL = gemv_n_rvv.c +DGEMVNKERNEL = gemv_n_rvv.c +CGEMVNKERNEL = zgemv_n_rvv.c +ZGEMVNKERNEL = zgemv_n_rvv.c + +SGEMVTKERNEL = gemv_t_rvv.c +DGEMVTKERNEL = gemv_t_rvv.c +CGEMVTKERNEL = zgemv_t_rvv.c +ZGEMVTKERNEL = zgemv_t_rvv.c + +CTRMMKERNEL = ztrmmkernel_rvv_v1x4.c +ZTRMMKERNEL = ztrmmkernel_rvv_v1x4.c + +# SGEMM_UNROLL_N set in params.h +ifeq ($(SGEMM_UNROLL_N), 8) +# UNROLL_M is VLMAX +SGEMMKERNEL = gemmkernel_rvv_v1x8.c +SGEMMINCOPY = gemm_ncopy_rvv_v1.c +SGEMMITCOPY = gemm_tcopy_rvv_v1.c +SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMKERNEL = trmmkernel_rvv_v1x8.c + +STRMMUNCOPY_M = trmm_uncopy_rvv_v1.c +STRMMLNCOPY_M = trmm_lncopy_rvv_v1.c +STRMMUTCOPY_M = trmm_utcopy_rvv_v1.c +STRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c + +SSYMMUCOPY_M = symm_ucopy_rvv_v1.c +SSYMMLCOPY_M = symm_lcopy_rvv_v1.c +endif + +# SGEMM_UNROLL_N set in params.h +ifeq ($(DGEMM_UNROLL_N), 8) +# UNROLL_M is VLMAX +DGEMMKERNEL = gemmkernel_rvv_v1x8.c +DGEMMINCOPY = gemm_ncopy_rvv_v1.c +DGEMMITCOPY = gemm_tcopy_rvv_v1.c +DGEMMONCOPY = gemm_ncopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMKERNEL = trmmkernel_rvv_v1x8.c +DTRMMUNCOPY_M = trmm_uncopy_rvv_v1.c +DTRMMLNCOPY_M = trmm_lncopy_rvv_v1.c +DTRMMUTCOPY_M = trmm_utcopy_rvv_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c + +DSYMMUCOPY_M = symm_ucopy_rvv_v1.c +DSYMMLCOPY_M = symm_lcopy_rvv_v1.c +endif + +CGEMMKERNEL = zgemmkernel_rvv_v1x4.c +CGEMMINCOPY = zgemm_ncopy_rvv_v1.c +CGEMMITCOPY = zgemm_tcopy_rvv_v1.c +CGEMMONCOPY = zgemm_ncopy_4_rvv.c +CGEMMOTCOPY = zgemm_tcopy_4_rvv.c + +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemmkernel_rvv_v1x4.c + +ZGEMMINCOPY = zgemm_ncopy_rvv_v1.c +ZGEMMITCOPY = zgemm_tcopy_rvv_v1.c +ZGEMMONCOPY = zgemm_ncopy_4_rvv.c +ZGEMMOTCOPY = zgemm_tcopy_4_rvv.c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +STRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +STRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +STRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c + +DTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +DTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +DTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +DTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c + +CTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +CTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +CTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +CTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +ZTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +ZTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +ZTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c + +TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c +TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c +TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c +TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_rvv_v1.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_rvv_v1.c +ZTRSMCOPYUN_M = ztrsm_uncopy_rvv_v1.c +ZTRSMCOPYUT_M = ztrsm_utcopy_rvv_v1.c + +SSYMV_U_KERNEL = symv_U_rvv.c +SSYMV_L_KERNEL = symv_L_rvv.c +DSYMV_U_KERNEL = symv_U_rvv.c +DSYMV_L_KERNEL = symv_L_rvv.c +CSYMV_U_KERNEL = zsymv_U_rvv.c +CSYMV_L_KERNEL = zsymv_L_rvv.c +ZSYMV_U_KERNEL = zsymv_U_rvv.c +ZSYMV_L_KERNEL = zsymv_L_rvv.c + +CHEMV_L_KERNEL = zhemv_LM_rvv.c +CHEMV_M_KERNEL = zhemv_LM_rvv.c +CHEMV_U_KERNEL = zhemv_UV_rvv.c +CHEMV_V_KERNEL = zhemv_UV_rvv.c +ZHEMV_L_KERNEL = zhemv_LM_rvv.c +ZHEMV_M_KERNEL = zhemv_LM_rvv.c +ZHEMV_U_KERNEL = zhemv_UV_rvv.c +ZHEMV_V_KERNEL = zhemv_UV_rvv.c + +ZHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c +ZHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c + +CHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c +CHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c + +ZSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c +ZSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c + +CSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c +CSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c + +ZTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c + +CTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = gemm_beta_rvv.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = gemm_beta_rvv.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = zgemm_beta_rvv.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = zgemm_beta_rvv.c +endif diff --git a/kernel/riscv64/amax_rvv.c b/kernel/riscv64/amax_rvv.c new file mode 100644 index 000000000..451fbc834 --- /dev/null +++ b/kernel/riscv64/amax_rvv.c @@ -0,0 +1,102 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + FLOAT_V_T vx, vmax; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmax = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); + } + + } + + v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + + return(maxf); +} diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c index b778d3e55..b66d4871e 100644 --- a/kernel/riscv64/amax_vector.c +++ b/kernel/riscv64/amax_vector.c @@ -28,36 +28,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -65,103 +70,28 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=0.0; if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; - FLOAT_V_T v0, v1, v_max; - FLOAT_V_T_M1 v_res, v_zero; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_zero = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T v0, v1; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); - MASK_T mask0, mask1; - FLOAT zero = 0.0; if(inc_x == 1){ gvl = VSETVL(n); if(gvl <= n/2){ - v_max = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i maxf) - maxf = *((FLOAT*)&v_res); + v0 = VFABS_FLOAT(v0, gvl); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } }else{ @@ -169,94 +99,27 @@ asm volatile( BLASLONG stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/2){ BLASLONG inc_xv = inc_x * gvl; - v_max = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i maxf) - maxf = *((FLOAT*)&v_res); + v0 = VFABS_FLOAT(v0, gvl); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } } + + maxf = EXTRACT_FLOAT(v_res); return(maxf); } diff --git a/kernel/riscv64/amin_rvv.c b/kernel/riscv64/amin_rvv.c new file mode 100644 index 000000000..5186d7b12 --- /dev/null +++ b/kernel/riscv64/amin_rvv.c @@ -0,0 +1,102 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + FLOAT_V_T vx, vmin; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); + } + + } + + v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + + return(minf); +} diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index fd2f83dc9..c4578eabf 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -26,232 +26,108 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" -#include -#include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define ABS fabs +# else +# define ELEN 32 +# define ABS fabsf +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define ABS fabs +# else +# define ELEN 32 +# define ABS fabsf +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - if (n <= 0 || inc_x <= 0) return(0.0); - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT minf=0.0; + if (n <= 0 || inc_x <= 0) return(minf); + + minf = ABS(*x); + x += inc_x; + --n; + if (n == 0) return(minf); + unsigned int gvl = 0; - FLOAT_V_T v0, v1, v_min; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T v0, v1; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(minf, 1); - MASK_T mask0, mask1; - FLOAT zero = 0.0; if(inc_x == 1){ gvl = VSETVL(n); if(gvl <= n/2){ - v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); + } + + } + + v_res = VFREDSUMVS_FLOAT(vsum, v_res, vlmax); + asumf = VFMVFS_FLOAT_M1(v_res); + return(asumf); +} diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c index a82275153..a652eafdd 100644 --- a/kernel/riscv64/asum_vector.c +++ b/kernel/riscv64/asum_vector.c @@ -28,111 +28,101 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFADDVV_FLOAT vfadd_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFADDVV_FLOAT vfadd_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; - BLASLONG ix=0; FLOAT asumf=0.0; if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; - FLOAT_V_T v0, v1, v_zero,v_sum; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T v0, v1, v_sum; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); - MASK_T mask0, mask1; if(inc_x == 1){ gvl = VSETVL(n); - v_zero = VFMVVF_FLOAT(0, gvl); if(gvl <= n/2){ v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i 0; n -= vl, y += vl*inc_y) { + vl = VSETVL(n); + VSSEV_FLOAT(y, stride_y, vy, vl); + } + } + + } else { + if ((1 == inc_x) && (1 == inc_y)) { + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSEV_FLOAT (y, vy, vl); + } + } else if (1 == inc_x) { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } else if (1 == inc_y) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSEV_FLOAT (y, vy, vl); + } + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } + } + + } else { + if ( alpha == 0.0 ) { + if (1 == inc_y) { + for (size_t vl; n > 0; n -= vl, y += vl) { + vl = VSETVL(n); + vy = VLEV_FLOAT(y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSEV_FLOAT (y, vy, vl); + } + } else { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, y += vl*inc_y) { + vl = VSETVL(n); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } + + } else { + if ((1 == inc_x) && (1 == inc_y)) { + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSEV_FLOAT (y, vy, vl); + } + } else if (1 == inc_x) { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } else if (1 == inc_y) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSEV_FLOAT (y, vy, vl); + } + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } + } + } + + return(0); +} diff --git a/kernel/riscv64/axpby_vector.c b/kernel/riscv64/axpby_vector.c index 988c57ec2..721aad2b0 100644 --- a/kernel/riscv64/axpby_vector.c +++ b/kernel/riscv64/axpby_vector.c @@ -27,31 +27,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 +# define LMUL m4 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) +#define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMULVF_FLOAT JOIN(RISCV_RVV(vfmul), _vf_f, ELEN, LMUL, _) + int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) { - if (n < 0) return(0); + if (n <= 0) return(0); BLASLONG i=0, j=0; unsigned int gvl = 0; @@ -60,6 +69,63 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * BLASLONG stride_x, stride_y, ix = 0, iy = 0; + if (inc_x == 0 || inc_y == 0) { /* use trivial non-vectorized loop if either increment is zero */ + + if ( beta == 0.0 ) + { + + if ( alpha == 0.0 ) + { + while(i < n) + { + y[iy] = 0.0 ; + iy += inc_y ; + i++ ; + } + } + else + { + while(i < n) + { + y[iy] = alpha * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + + + } + + } + else + { + + if ( alpha == 0.0 ) + { + while(i < n) + { + y[iy] = beta * y[iy] ; + iy += inc_y ; + i++ ; + } + } + else + { + while(i < n) + { + y[iy] = alpha * x[ix] + beta * y[iy] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + } + + } + + return(0); + + } else { /* vectorized approach for non-zero increments */ + if(beta == 0.0){ if(alpha == 0.0){//alpha == 0 && beta == 0 if(inc_y == 1){ @@ -372,5 +438,6 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * } } return(0); + } } diff --git a/kernel/riscv64/axpy.c b/kernel/riscv64/axpy.c index fb1094dd9..19d12ad3f 100644 --- a/kernel/riscv64/axpy.c +++ b/kernel/riscv64/axpy.c @@ -42,7 +42,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS BLASLONG i=0; BLASLONG ix,iy; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); if ( da == 0.0 ) return(0); ix = 0; diff --git a/kernel/riscv64/axpy_rvv.c b/kernel/riscv64/axpy_rvv.c new file mode 100644 index 000000000..2d5293f76 --- /dev/null +++ b/kernel/riscv64/axpy_rvv.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_M1_T vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSEV_FLOAT_M1 __riscv_vse32_v_f32m1 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_M1_T vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSEV_FLOAT_M1 __riscv_vse64_v_f64m1 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if ( n <= 0 ) return(0); + if ( da == 0.0 ) return(0); + + FLOAT_V_T vx, vy; + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSEV_FLOAT (y, vy, vl); + } + + } else if (1 == inc_y) { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSEV_FLOAT(y, vy, vl); + } + + } else if (1 == inc_x && 0 != inc_y) { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSSEV_FLOAT(y, stride_y, vy, vl); + } + + } else if( 0 == inc_y ) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + size_t in_vl = VSETVL(n); + vy = VFMVVF_FLOAT( y[0], in_vl ); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + } + FLOAT_V_M1_T vres = VFMVVF_FLOAT_M1( 0.0f, 1 ); + vres = VFREDSUMVS_FLOAT( vy, vres, in_vl ); + VSEV_FLOAT_M1(y, vres, 1); + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSSEV_FLOAT(y, stride_y, vy, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c index 98b9f6814..c77a18afa 100644 --- a/kernel/riscv64/axpy_vector.c +++ b/kernel/riscv64/axpy_vector.c @@ -25,24 +25,45 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ + #include "common.h" -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif +#else +# define LMUL m4 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif +#endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_M1_T JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) +#define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) + +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -53,7 +74,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS FLOAT_V_T vy0, vy1; BLASLONG stride_x, stride_y; - if (n < 0) return(0); + if (n <= 0) return(0); if (da == 0.0) return(0); if (inc_x == 1 && inc_y == 1) { @@ -111,7 +132,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS VSEV_FLOAT(&y[j], vy0, gvl); j += gvl; } - }else if(inc_x == 1){ + } else if (1 == inc_x && 0 != inc_y) { stride_y = inc_y * sizeof(FLOAT); gvl = VSETVL(n); if(gvl <= n/2){ @@ -139,6 +160,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl); j += gvl; } + } else if( 0 == inc_y ) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + size_t in_vl = VSETVL(n); + vy0 = VFMVVF_FLOAT( y[0], in_vl ); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + vx0 = VLSEV_FLOAT(x, stride_x, vl); + vy0 = VFMACCVF_FLOAT(vy0, da, vx0, vl); + } + FLOAT_V_M1_T v_res = VFMVVF_FLOAT_M1( 0.0f, 1 ); + v_res = VFREDSUMVS_FLOAT( vy0, v_res, in_vl ); + y[0] = EXTRACT_FLOAT(v_res); }else{ stride_x = inc_x * sizeof(FLOAT); stride_y = inc_y * sizeof(FLOAT); diff --git a/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c new file mode 100644 index 000000000..bd615389c --- /dev/null +++ b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c @@ -0,0 +1,996 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=8 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='float' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=32 + ELEN_PARAM=32 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f32m2' + VFMUL='__riscv_vfmul_vf_f32m2' + VLEV='__riscv_vle32_v_f32m2' + VLSEV='__riscv_vlse32_v_f32m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' + VSETVL='__riscv_vsetvl_e32m2' + VSEV='__riscv_vse32_v_f32m2' + VSSEV='__riscv_vsse32_v_f32m2' + acc_vector_t='vfloat32m2_t' + output='cgemm_kernel_8x4_zvl128b.c' + param_scalar_t='float' + param_vector_t='vfloat32m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e32m2(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); + C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); + C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); + C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); + C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); + C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); + C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + float result8 = 0; + float result9 = 0; + float result10 = 0; + float result11 = 0; + float result12 = 0; + float result13 = 0; + float result14 = 0; + float result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result8 * alphar; + Ci += result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; + Cr += result10 * alphar; + Ci += result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result12 * alphar; + Ci += result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; + Cr += result14 * alphar; + Ci += result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c b/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c new file mode 100644 index 000000000..7980c029a --- /dev/null +++ b/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c @@ -0,0 +1,1931 @@ +/* + +AUTOGENERATED KERNEL +Settings: + LMUL=1 + M=8 + M_tail_scalar_from=1 + N=8 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl256b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='float' + reg_width_bits=256 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=32 + ELEN_PARAM=32 + LMUL_ACC=1 + VFMACC='__riscv_vfmacc_vf_f32m1' + VFMUL='__riscv_vfmul_vf_f32m1' + VLEV='__riscv_vle32_v_f32m1' + VLSEV='__riscv_vlse32_v_f32m1' + VMACC_TO_ACC='__riscv_vfmacc_vf_f32m1' + VMUL_TO_ACC='__riscv_vfmul_vf_f32m1' + VSETVL='__riscv_vsetvl_e32m1' + VSEV='__riscv_vse32_v_f32m1' + VSSEV='__riscv_vsse32_v_f32m1' + acc_vector_t='vfloat32m1_t' + output='cgemm_kernel_8x8_zvl256b.c' + param_scalar_t='float' + param_vector_t='vfloat32m1_t' + +*/ + +#include "common.h" + + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define S0 1 + #define S1 -1 + #define S2 1 + #define S3 1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define S0 1 + #define S1 1 + #define S2 1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define S0 1 + #define S1 1 + #define S2 -1 + #define S3 1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define S0 1 + #define S1 -1 + #define S2 -1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfnmacc +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + + // -- MAIN PASS + + for (BLASLONG j=0; j 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + v0 = VLEV_FLOAT(x, vl); + VSEV_FLOAT(y, v0, vl); + } + + } else if (inc_y == 1) { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + v0 = VLSEV_FLOAT(x, stride_x, vl); + VSEV_FLOAT(y, v0, vl); + } + + } else if(inc_x == 1) { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + v0 = VLEV_FLOAT(x, vl); + VSSEV_FLOAT(y, stride_y, v0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + v0 = VLSEV_FLOAT(x, stride_x, vl); + VSSEV_FLOAT(y, stride_y, v0, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/copy_vector.c b/kernel/riscv64/copy_vector.c index a46136d6c..ccbd6e482 100644 --- a/kernel/riscv64/copy_vector.c +++ b/kernel/riscv64/copy_vector.c @@ -25,22 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) + int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0, j=0; @@ -58,7 +71,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/4){ BLASLONG inc_xv = inc_x * gvl; - BLASLONG gvl3 = gvl * 3; + unsigned int gvl3 = gvl * 3; BLASLONG inc_xv3 = inc_xv * 3; for(i=0,j=0; i 0; n -= vl, x += vl, y += vl) { + vl = __riscv_vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); + vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); + + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); +#else + vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); + vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); + + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); +#endif + } + + } else if (1 == inc_x) { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = __riscv_vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); + vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); + + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); +#else + vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); + vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); + + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); +#endif + } + } else if (1 == inc_y) { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = __riscv_vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); + vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); + + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); +#else + vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); + vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); + + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); +#endif + } + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = __riscv_vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); + vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); + + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); +#else + vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); + vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); + + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); +#endif + } + } + + vfloat64m1_t vec_zero = __riscv_vfmv_v_f_f64m1(0, vlmax); + vfloat64m1_t vec_sum = __riscv_vfredusum_vs_f64m8_f64m1(vr, vec_zero, vlmax); + dot = __riscv_vfmv_f_s_f64m1_f64(vec_sum); + + return(dot); +} diff --git a/kernel/riscv64/dot_vector.c b/kernel/riscv64/dot_vector.c index 64efc6c40..38ccc4778 100644 --- a/kernel/riscv64/dot_vector.c +++ b/kernel/riscv64/dot_vector.c @@ -27,31 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFDOTVV_FLOAT vfdot_vv_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFDOTVV_FLOAT vfdot_vv_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4) #endif #if defined(DSDOT) @@ -63,7 +69,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG i=0, j=0; double dot = 0.0 ; - if ( n < 0 ) return(dot); + if ( n < 1 ) return(dot); FLOAT_V_T vr, vx, vy; unsigned int gvl = 0; @@ -82,8 +88,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) j += gvl; } if(j > 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -93,13 +99,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } }else if(inc_y == 1){ gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); - int stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_x = inc_x * sizeof(FLOAT); for(i=0,j=0; i 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -119,14 +124,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } }else if(inc_x == 1){ gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); - int stride_y = inc_y * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); for(i=0,j=0; i 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -146,15 +149,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } }else{ gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); - int stride_x = inc_x * sizeof(FLOAT); - int stride_y = inc_y * sizeof(FLOAT); + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); for(i=0,j=0; i 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -174,9 +175,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } } return(dot); diff --git a/kernel/riscv64/dsdot_vector.c b/kernel/riscv64/dsdot_vector.c new file mode 100644 index 000000000..e972828b5 --- /dev/null +++ b/kernel/riscv64/dsdot_vector.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0, j=0; + double dot = 0.0 ; + + if ( n < 1 ) return(dot); + vfloat64m4_t vr; + vfloat32m2_t vx, vy; + unsigned int gvl = 0; + vfloat64m1_t v_res, v_z0; + gvl = vsetvlmax_e64m1(); + v_res = vfmv_v_f_f64m1(0, gvl); + v_z0 = vfmv_v_f_f64m1(0, gvl); + + if(inc_x == 1 && inc_y == 1){ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vle32_v_f32m2(&x[j], gvl); + vy = vle32_v_f32m2(&y[j], gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + } + }else if(inc_y == 1){ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + int stride_x = inc_x * sizeof(FLOAT); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); + vy = vle32_v_f32m2(&y[j], gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + }else if(inc_x == 1){ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vle32_v_f32m2(&x[j], gvl); + vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + }else{ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + int stride_x = inc_x * sizeof(FLOAT); + int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); + vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + } + return(dot); +} diff --git a/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c b/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c new file mode 100644 index 000000000..c1e0da86e --- /dev/null +++ b/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c @@ -0,0 +1,660 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=4 + M=8 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=4 + VFMACC='__riscv_vfmacc_vf_f64m4' + VFMUL='__riscv_vfmul_vf_f64m4' + VLEV='__riscv_vle64_v_f64m4' + VLSEV='__riscv_vlse64_v_f64m4' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m4' + VSETVL='__riscv_vsetvl_e64m4' + VSEV='__riscv_vse64_v_f64m4' + VSSEV='__riscv_vsse64_v_f64m4' + acc_vector_t='vfloat64m4_t' + output='dtrmm_kernel_8x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m4_t' + +*/ + +#include "common.h" + +#if defined(LEFT) != defined(TRANSA) +#define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m4(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 4; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); + vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); + vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + ai += 2; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + C[ci + 2 * ldc + 0] = alpha * result4; + C[ci + 2 * ldc + 1] = alpha * result5; + C[ci + 3 * ldc + 0] = alpha * result6; + C[ci + 3 * ldc + 1] = alpha * result7; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + ai += 1; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + C[ci + 2 * ldc + 0] = alpha * result2; + C[ci + 3 * ldc + 0] = alpha * result3; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 2; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + ai += 2; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + ai += 1; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 1; +#endif +#endif + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + ai += 2; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + ai += 1; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c b/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c new file mode 100644 index 000000000..b1739f248 --- /dev/null +++ b/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c @@ -0,0 +1,1068 @@ +/* + +AUTOGENERATED KERNEL +Settings: + LMUL=1 + M=8 + M_tail_scalar_from=2 + N=8 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl256b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=256 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=1 + VFMACC='__riscv_vfmacc_vf_f64m1' + VFMUL='__riscv_vfmul_vf_f64m1' + VLEV='__riscv_vle64_v_f64m1' + VLSEV='__riscv_vlse64_v_f64m1' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m1' + VSETVL='__riscv_vsetvl_e64m1' + VSEV='__riscv_vse64_v_f64m1' + VSSEV='__riscv_vsse64_v_f64m1' + acc_vector_t='vfloat64m1_t' + output='dtrmm_kernel_8x8_zvl256b.c' + param_scalar_t='double' + param_vector_t='vfloat64m1_t' + +*/ + +#include "common.h" + + + +#if defined(LEFT) != defined(TRANSA) + #define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + + // -- MAIN PASS + + for (BLASLONG j=0; j 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { + vl = VSETVL(chunk); + + VSEV_FLOAT(c_offset, vx, vl); + } + } + + } else { + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { + vl = VSETVL(chunk); + + vx = VLEV_FLOAT(c_offset, vl); + vx = VFMULVF_FLOAT(vx, beta, vl); + VSEV_FLOAT(c_offset, vx, vl); + } + } + + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_8_rvv.c b/kernel/riscv64/gemm_ncopy_8_rvv.c new file mode 100644 index 000000000..c652ab0c0 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_8_rvv.c @@ -0,0 +1,197 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t +#define VSET_VX2 __riscv_vset_v_f32m1_f32m1x2 +#define VSET_VX4 __riscv_vset_v_f32m1_f32m1x4 +#define VSET_VX8 __riscv_vset_v_f32m1_f32m1x8 +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 +#else +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t +#define VSET_VX2 __riscv_vset_v_f64m1_f64m1x2 +#define VSET_VX4 __riscv_vset_v_f64m1_f64m1x4 +#define VSET_VX8 __riscv_vset_v_f64m1_f64m1x8 +#define VLEV_FLOAT __riscv_vle64_v_f64m1 +#define VSEV_FLOAT __riscv_vse64_v_f64m1 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 +#endif + +// Optimizes the implementation in ../generic/gemm_ncopy_8.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8; + FLOAT *b_offset; + + FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; + + size_t vl; + + //fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); + + a_offset = a; + b_offset = b; + + for(j = (n >> 3); j > 0; j--) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset5 = a_offset4 + lda; + a_offset6 = a_offset5 + lda; + a_offset7 = a_offset6 + lda; + a_offset8 = a_offset7 + lda; + a_offset += 8 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + v5 = VLEV_FLOAT(a_offset5, vl); + v6 = VLEV_FLOAT(a_offset6, vl); + v7 = VLEV_FLOAT(a_offset7, vl); + v8 = VLEV_FLOAT(a_offset8, vl); + + vx8 = VSET_VX8(vx8, 0, v1); + vx8 = VSET_VX8(vx8, 1, v2); + vx8 = VSET_VX8(vx8, 2, v3); + vx8 = VSET_VX8(vx8, 3, v4); + vx8 = VSET_VX8(vx8, 4, v5); + vx8 = VSET_VX8(vx8, 5, v6); + vx8 = VSET_VX8(vx8, 6, v7); + vx8 = VSET_VX8(vx8, 7, v8); + + VSSEG8_FLOAT(b_offset, vx8, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + a_offset5 += vl; + a_offset6 += vl; + a_offset7 += vl; + a_offset8 += vl; + b_offset += vl*8; + } + } + + if (n & 4) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + + vx4 = VSET_VX4(vx4, 0, v1); + vx4 = VSET_VX4(vx4, 1, v2); + vx4 = VSET_VX4(vx4, 2, v3); + vx4 = VSET_VX4(vx4, 3, v4); + + VSSEG4_FLOAT(b_offset, vx4, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + b_offset += vl*4; + } + } + + if (n & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + + vx2 = VSET_VX2(vx2, 0, v1); + vx2 = VSET_VX2(vx2, 1, v2); + + VSSEG2_FLOAT(b_offset, vx2, vl); + + a_offset1 += vl; + a_offset2 += vl; + b_offset += vl*2; + } + } + + if (n & 1) { + a_offset1 = a_offset; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + + VSEV_FLOAT(b_offset, v1, vl); + + a_offset1 += vl; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_rvv_v1.c b/kernel/riscv64/gemm_ncopy_rvv_v1.c new file mode 100644 index 000000000..2d6db15e5 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_rvv_v1.c @@ -0,0 +1,76 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1; + FLOAT *b_offset; + + FLOAT_V_T v0; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + a_offset = a; + b_offset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + a_offset1 = a_offset; + a_offset += vl * lda; + + for(i = m; i > 0; i--) { + v0 = VLSEV_FLOAT(a_offset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(b_offset, v0, vl); + + a_offset1++; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_8_rvv.c b/kernel/riscv64/gemm_tcopy_8_rvv.c new file mode 100644 index 000000000..4742ae6a7 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_8_rvv.c @@ -0,0 +1,273 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 +#else +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m1 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m1 +#define VSEV_FLOAT __riscv_vse64_v_f64m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + + IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; + + FLOAT_V_T v0; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; + + // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); + + aoffset = a; + boffset = b; + boffset2 = b + m * (n & ~7); + boffset3 = b + m * (n & ~3); + boffset4 = b + m * (n & ~1); + + for(j = (m >> 3); j > 0; j--) { + + aoffset1 = aoffset; + aoffset += 8 * lda; + + boffset1 = boffset; + boffset += 64; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 8; + + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, vx8, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 4) { + size_t vl = 8; + + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, vx4, vl); + + aoffset1 += 4; + boffset2 += 32; + } + + if (n & 2) { + size_t vl = 8; + + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, vx2, vl); + + aoffset1 += 2; + boffset3 += 16; + } + + if (n & 1) { + size_t vl = 8; + + v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(boffset4, v0, vl); + + aoffset1 += 1; + boffset4 += 8; + } + + } + + if (m & 4) { + + aoffset1 = aoffset; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 32; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 4; + + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, vx8, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 4) { + size_t vl = 4; + + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, vx4, vl); + + aoffset1 += 4; + boffset2 += 16; + } + + if (n & 2) { + size_t vl = 4; + + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, vx2, vl); + + aoffset1 += 2; + boffset3 += 8; + } + + if (n & 1) { + size_t vl = 4; + + v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(boffset4, v0, vl); + + aoffset1 += 1; + boffset4 += 4; + } + } + + if (m & 2) { + aoffset1 = aoffset; + aoffset += 2 * lda; + + boffset1 = boffset; + boffset += 16; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 2; + + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, vx8, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 4) { + size_t vl = 2; + + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, vx4, vl); + + aoffset1 += 4; + boffset2 += 8; + } + + if (n & 2) { + size_t vl = 2; + + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, vx2, vl); + + aoffset1 += 2; + boffset3 += 4; + } + + if (n & 1) { + size_t vl = 2; + + v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(boffset4, v0, vl); + + aoffset1 += 1; + boffset4 += 2; + } + } + + if (m & 1) { + aoffset1 = aoffset; + boffset1 = boffset; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 8; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v0, vl); + + aoffset1 += 8; + boffset1 += 8 * m; + } + + if (n & 4) { + size_t vl = 4; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset2, v0, vl); + + aoffset1 += 4; + //boffset2 += 4; + } + + if (n & 2) { + size_t vl = 2; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset3, v0, vl); + + aoffset1 += 2; + // boffset3 += 2; + } + + if (n & 1) { + *(boffset4) = *(aoffset1); + // aoffset1 ++; + // boffset4 ++; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_rvv_v1.c b/kernel/riscv64/gemm_tcopy_rvv_v1.c new file mode 100644 index 000000000..c5fb6479f --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_rvv_v1.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + IFLOAT *boffset; + + FLOAT_V_T v0; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + aoffset = a; + boffset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + aoffset1 = aoffset; + aoffset += vl; + + for(i = m; i > 0; i--) { + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset, v0, vl); + + aoffset1 += lda; + boffset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemmkernel_rvv_v1x8.c b/kernel/riscv64/gemmkernel_rvv_v1x8.c new file mode 100644 index 000000000..471b3158f --- /dev/null +++ b/kernel/riscv64/gemmkernel_rvv_v1x8.c @@ -0,0 +1,601 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#endif + +int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7; + IFLOAT *ptrba,*ptrbb; + + //fprintf(stderr, "%s, bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", __FUNCTION__, bm, bn, bk, alpha, ldc); // Debug + + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + size_t vl; + + // N:8 + for (j = bn/8; j > 0; j--) { + C0 = C; + C1 = C0 + ldc; + C2 = C1 + ldc; + C3 = C2 + ldc; + C4 = C3 + ldc; + C5 = C4 + ldc; + C6 = C5 + ldc; + C7 = C6 + ldc; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + + ptrba += vl; + ptrbb += 8; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + ptrbb += 8; + + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl); + ptrbb += 8; + + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl); + ptrbb += 8; + + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl); + ptrbb += 8; + + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl); + ptrbb += 8; + + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl); + ptrbb += 8; + + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl); + ptrbb += 8; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl); + ptrbb += 8; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VLEV_FLOAT(C1, vl); + va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VLEV_FLOAT(C2, vl); + va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VLEV_FLOAT(C3, vl); + va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); + VSEV_FLOAT(C3, va3, vl); + + va4 = VLEV_FLOAT(C4, vl); + va4 = VFMACCVF_FLOAT(va4, alpha, vres4, vl); + VSEV_FLOAT(C4, va4, vl); + + va5 = VLEV_FLOAT(C5, vl); + va5 = VFMACCVF_FLOAT(va5, alpha, vres5, vl); + VSEV_FLOAT(C5, va5, vl); + + va6 = VLEV_FLOAT(C6, vl); + va6 = VFMACCVF_FLOAT(va6, alpha, vres6, vl); + VSEV_FLOAT(C6, va6, vl); + + va7 = VLEV_FLOAT(C7, vl); + va7 = VFMACCVF_FLOAT(va7, alpha, vres7, vl); + VSEV_FLOAT(C7, va7, vl); + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + C4 += vl; + C5 += vl; + C6 += vl; + C7 += vl; + } + + bb += (bk<<3); + C += (ldc<<3); + } + + // N:4 + if (bn & 4) { + C0 = C; + C1 = C0 + ldc; + C2 = C1 + ldc; + C3 = C2 + ldc; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + + ptrba += vl; + ptrbb += 4; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + ptrbb += 4; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + ptrbb += 4; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + ptrbb += 4; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + ptrbb += 4; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + ptrbb += 4; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + ptrbb += 4; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + ptrbb += 4; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + ptrbb += 4; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VLEV_FLOAT(C1, vl); + va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VLEV_FLOAT(C2, vl); + va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VLEV_FLOAT(C3, vl); + va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); + VSEV_FLOAT(C3, va3, vl); + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + } + + bb += (bk<<2); + C += (ldc<<2); + } + + // N:2 + if (bn & 2) { + C0 = C; + C1 = C0 + ldc; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + + ptrba += vl; + ptrbb += 2; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + ptrbb += 2; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + ptrbb += 2; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + ptrbb += 2; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + ptrbb += 2; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + ptrbb += 2; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + ptrbb += 2; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + ptrbb += 2; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VLEV_FLOAT(C1, vl); + va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); + VSEV_FLOAT(C1, va1, vl); + + C0 += vl; + C1 += vl; + } + + bb += (bk<<1); + C += (ldc<<1); + } + + // N:1 + if (bn & 1) { + C0 = C; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + + ptrba += vl; + ptrbb += 1; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + ptrbb += 1; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + ptrbb += 1; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + ptrbb += 1; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + ptrbb += 1; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + ptrbb += 1; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + ptrbb += 1; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + ptrbb += 1; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + ptrbb += 1; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + + ptrbb += 1; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + C0 += vl; + } + + bb += (bk); + C += (ldc); + } + + return 0; +} diff --git a/kernel/riscv64/gemv_n_rvv.c b/kernel/riscv64/gemv_n_rvv.c new file mode 100644 index 000000000..1366eb5ad --- /dev/null +++ b/kernel/riscv64/gemv_n_rvv.c @@ -0,0 +1,94 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + if(n < 0) return(0); + + FLOAT *a_ptr, *x_ptr; + BLASLONG i; + FLOAT_V_T va, vy; + + if(inc_y == 1) { + + for (size_t vl; m > 0; m -= vl, y += vl, a += vl) { + vl = VSETVL(m); + a_ptr = a; + x_ptr = x; + vy = VLEV_FLOAT(y, vl); + for(i = 0; i < n; i++) { + va = VLEV_FLOAT(a_ptr, vl); + vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); + + a_ptr += lda; + x_ptr += inc_x; + } + VSEV_FLOAT(y, vy, vl); + } + + } else { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; m > 0; m -= vl, y += vl*inc_y, a += vl) { + vl = VSETVL(m); + a_ptr = a; + x_ptr = x; + vy = VLSEV_FLOAT(y, stride_y, vl); + for(i = 0; i < n; i++) { + va = VLEV_FLOAT(a_ptr, vl); + vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); + + a_ptr += lda; + x_ptr += inc_x; + } + VSSEV_FLOAT(y, stride_y, vy, vl); + } + + } + return(0); +} diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c index 32ca8618b..aa13fc87d 100644 --- a/kernel/riscv64/gemv_n_vector.c +++ b/kernel/riscv64/gemv_n_vector.c @@ -27,21 +27,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) diff --git a/kernel/riscv64/gemv_t_rvv.c b/kernel/riscv64/gemv_t_rvv.c new file mode 100644 index 000000000..9c859aa50 --- /dev/null +++ b/kernel/riscv64/gemv_t_rvv.c @@ -0,0 +1,118 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j; + FLOAT *a_ptr, *x_ptr; + + FLOAT_V_T va, vx, vr; + FLOAT_V_T_M1 v_res, v_z0; + size_t vlmax = VSETVL_MAX_M1; + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + vlmax = VSETVL_MAX; + + if(inc_x == 1) { + + for(i = 0; i < n; i++) { + j = m; + a_ptr = a; + x_ptr = x; + vr = VFMVVF_FLOAT(0, vlmax); + + for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl) { + vl = VSETVL(j); + + va = VLEV_FLOAT(a_ptr, vl); + vx = VLEV_FLOAT(x_ptr, vl); + vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl); + } + + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + *y += alpha * VFMVFS_FLOAT_M1(v_res); + y += inc_y; + a += lda; + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for(i = 0; i < n; i++) { + j = m; + a_ptr = a; + x_ptr = x; + vr = VFMVVF_FLOAT(0, vlmax); + + for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl*inc_x) { + vl = VSETVL(j); + + va = VLEV_FLOAT(a_ptr, vl); + vx = VLSEV_FLOAT(x_ptr, stride_x, vl); + vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl); + } + + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + *y += alpha * VFMVFS_FLOAT_M1(v_res); + y += inc_y; + a += lda; + } + + } + + return(0); +} diff --git a/kernel/riscv64/gemv_t_vector.c b/kernel/riscv64/gemv_t_vector.c index 7683641fa..62b85164c 100644 --- a/kernel/riscv64/gemv_t_vector.c +++ b/kernel/riscv64/gemv_t_vector.c @@ -27,107 +27,110 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t +#define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n) +#define FLOAT_V_T vfloat32m2_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFDOTVV_FLOAT vfdot_vv_f32m4 -#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m2) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m2_f32m1(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m2_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m2) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m2) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m2) +#define xint_t int +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n) +#define FLOAT_V_T vfloat64m2_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFDOTVV_FLOAT vfdot_vv_f64m4 -#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m2) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m2_f64m1(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m2_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m2) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m2) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m2) +#define xint_t long long #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i = 0, j = 0, k = 0; - BLASLONG ix = 0, iy = 0; - FLOAT *a_ptr = a; + BLASLONG i = 0, j = 0, k = 0; + BLASLONG ix = 0, iy = 0; + FLOAT *a_ptr = a; FLOAT temp; FLOAT_V_T va, vr, vx; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + if(inc_x == 1){ for(i = 0; i < n; i++){ + v_res = VFMVVF_FLOAT_M1(0, 1); gvl = VSETVL(m); j = 0; vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < m/gvl; k++){ va = VLEV_FLOAT(&a_ptr[j], gvl); vx = VLEV_FLOAT(&x[j], gvl); - vr = VFMACCVV_FLOAT(vr, va, vx, gvl); + vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail j += gvl; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp = (FLOAT)VFMVFS_FLOAT(v_res); if(j < m){ gvl = VSETVL(m-j); va = VLEV_FLOAT(&a_ptr[j], gvl); vx = VLEV_FLOAT(&x[j], gvl); vr = VFMULVV_FLOAT(va, vx, gvl); - - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp += (FLOAT)VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); } + temp = (FLOAT)EXTRACT_FLOAT(v_res); y[iy] += alpha * temp; + + iy += inc_y; a_ptr += lda; } }else{ BLASLONG stride_x = inc_x * sizeof(FLOAT); - for(i = 0; i < n; i++){ + v_res = VFMVVF_FLOAT_M1(0, 1); gvl = VSETVL(m); - BLASLONG inc_xv = inc_x * gvl; j = 0; ix = 0; vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < m/gvl; k++){ va = VLEV_FLOAT(&a_ptr[j], gvl); vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); - vr = VFMACCVV_FLOAT(vr, va, vx, gvl); + vr = VFMULVV_FLOAT(va, vx, gvl); + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); j += gvl; - ix += inc_xv; + ix += inc_x * gvl; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp = (FLOAT)VFMVFS_FLOAT(v_res); if(j < m){ gvl = VSETVL(m-j); va = VLEV_FLOAT(&a_ptr[j], gvl); vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(va, vx, gvl); - - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp += (FLOAT)VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); } + temp = (FLOAT)EXTRACT_FLOAT(v_res); y[iy] += alpha * temp; + + iy += inc_y; a_ptr += lda; } } + + return(0); } - diff --git a/kernel/riscv64/generate_kernel.py b/kernel/riscv64/generate_kernel.py new file mode 100755 index 000000000..8be7c9f9c --- /dev/null +++ b/kernel/riscv64/generate_kernel.py @@ -0,0 +1,673 @@ +#!/usr/bin/python3 + +import sys, os +import contextlib + +#----------------------------------------------------------------------- +def ERROR(*args, **kwargs): + print(*args, file=sys.stderr, **kwargs) + sys.exit(-1) + +class Target(object): + def __init__( self, out, mappings, initial_level=0, tab_width=4 ): + self._level = initial_level + self._tab_width = tab_width + self._out = out + self._mappings = mappings + + @contextlib.contextmanager + def map( self, **items ): + old_mappings = self._mappings + self._mappings = dict(old_mappings, **items) + yield self._mappings + self._mappings = old_mappings + + @contextlib.contextmanager + def block( self, start=None, end=None, **args ): + with self.map(**args): + if start is not None: + self.write(); + self.write(start) + self._level += 1 + yield self._level + self._level -= 1 + if end is not None: + self.write(end) + self.write() + + def write( self, fmt=None, *args, **kwargs ): + if fmt is not None: + mappings = dict(self._mappings, **kwargs) if kwargs else self._mappings + self._out(self._indent_str() + fmt.format(*args, **mappings)) + else: + self._out("") + + def _indent_str( self ): + return ' ' * (self._level * self._tab_width) + +#----------------------------------------------------------------------- +def generate_trmm_block( dest ): + dest.write("{index_type} pass_K = K;") + dest.write("#ifdef LEFT") + with dest.block(): + dest.write("{index_type} off = offset + m_top;") + dest.write("#else") + with dest.block(): + dest.write("{index_type} off = -offset + n_top;") + dest.write("#endif") + + dest.write("#ifdef BACKWARDS") + with dest.block(): + dest.write("ai += off*{M}{elt_size};") + dest.write("bi += off*{N}{elt_size};") + dest.write("pass_K -= off;") + dest.write("#else") + with dest.block(): + dest.write("#ifdef LEFT") + with dest.block(): + dest.write("pass_K = off + {M};") + dest.write("#else") + with dest.block(): + dest.write("pass_K = off + {N};") + dest.write("#endif") + dest.write("#endif") + +#----------------------------------------------------------------------- +def generate_gemm_kernel_inner_real( settings, dest, M, N, vlen, a_regs ): + TRMM = (settings['op'].value == 'trmm') + narrow_result = (settings['param_precision'].value != 'double') and settings['force_acc_double'].value + + with dest.map( + M=M, + N=N, + ): + dest.write("{index_type} ai=m_top*K{elt_size};") + dest.write("{index_type} bi=n_top*K{elt_size};") + if TRMM: + generate_trmm_block( dest ) + + for i in range(N): + dest.write("{param_scalar_t} B{i} = B[bi+{i}];", i=i) + dest.write("bi += {N};") + dest.write() + + for i in range(a_regs): + dest.write("{param_vector_t} A{i} = {VLEV}( &A[ai+{i}*gvl], gvl );", i=i) + dest.write("ai += {M};") + dest.write() + + for j in range(N): + for i in range(a_regs): + dest.write("{acc_vector_t} result{dest} = {VMUL_TO_ACC}( A{i}, B{j}, gvl);", dest=j*a_regs+i, i=i, j=j) + + with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')): + for i in range(N): + dest.write("B{i} = B[bi+{i}];", i=i ) + dest.write("bi += {N};") + dest.write() + + for i in range(a_regs): + dest.write("A{i} = {VLEV}( &A[ai+{i}*gvl], gvl );", i=i) + + dest.write("ai += {M};") + dest.write() + + + for j in range(N): + for i in range(a_regs): + dest.write("result{dest} = {VMACC_TO_ACC}( result{dest}, B{j}, A{i}, gvl);", dest= j*a_regs+i, j=j, i=i ) + + dest.write() + dest.write("{index_type} ci=n_top*ldc+m_top;") + dest.write() + + if narrow_result: + for j in range(N): + for i in range(a_regs): + dest.write("{param_vector_t} narrowed{idx} = {VFNCVT}( result{idx}, gvl );", idx=j*a_regs+i) + + if not TRMM: + for j in range(N): + for i in range(a_regs): + idx = j*a_regs+i + increment = ' ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' + if idx == N*a_regs-1: + increment = '' + dest.write("{param_vector_t} c{idx} = {VLEV}( &C[ci], gvl);{increment}", idx=idx, increment=increment) + + if narrow_result: + for j in range(N): + for i in range(a_regs): + idx = j*a_regs+i + if TRMM: + dest.write("{param_vector_t} c{idx} = {VFMUL}( narrowed{idx}, alpha, gvl );", idx=idx) + else: + dest.write("c{idx} = {VFMACC}( c{idx}, alpha, narrowed{idx}, gvl );", idx=idx) + else: + for j in range(N): + for i in range(a_regs): + idx = j*a_regs+i + if TRMM: + dest.write("{param_vector_t} c{idx} = {VFMUL}( result{idx}, alpha, gvl );", idx=idx) + else: + dest.write("c{idx} = {VFMACC}( c{idx}, alpha, result{idx}, gvl );", idx=idx) + + + if not TRMM: + dest.write() + dest.write("ci=n_top*ldc+m_top;") + dest.write() + + for j in range(N): + for i in range(a_regs): + idx = j*a_regs+i + increment = ' ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' + if idx == N*a_regs-1: + increment = '' + dest.write("{VSEV}( &C[ci], c{idx}, gvl);{increment}", idx=idx, increment=increment) + + +#----------------------------------------------------------------------- +def generate_gemm_kernel_inner_complex( settings, dest, M, N, vlen, a_regs ): + TRMM = (settings['op'].value == 'trmm') + narrow_result = (settings['param_precision'].value != 'double') and settings['force_acc_double'].value + + if narrow_result: + raise RuntimeError("wide accumulator not supported for generated complex kernels") + # we could, but we run out of registers really really fast + + with dest.map( + M=M, + N=N, + ): + dest.write("{index_type} ai=m_top*K*2;") + dest.write("{index_type} bi=n_top*K*2;") + if TRMM: + generate_trmm_block( dest ) + + for i in range(N): + dest.write("{param_scalar_t} B{i}r = B[bi+{i}*2+0];", i=i) + dest.write("{param_scalar_t} B{i}i = B[bi+{i}*2+1];", i=i) + dest.write("bi += {N}*2;") + dest.write() + + for i in range(a_regs): + dest.write("{param_vector_t} A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i) + dest.write("{param_vector_t} A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", i=i) + dest.write("ai += {M}*2;") + dest.write() + + # for each vector register loaded from matrix A, we require N registers to hold vector-scalar multiply-accumulate results + accumulation_regs = a_regs * N + dest.write("// {a_regs} vector regs to hold A array contents, {accumulation_regs} regs to hold values accumulated over k", + a_regs=a_regs*2, accumulation_regs=accumulation_regs*2 + ) + pass_regs = (accumulation_regs + a_regs)*2 + tmp_regs = (32 // settings['LMUL_ACC'].value) - pass_regs + if tmp_regs < 2: + raise RuntimeError("Complex kernel would use too many registers!") + + dest.write("// leaving {tmp_regs} vector registers for temporaries", tmp_regs=tmp_regs) + + tmp_unroll_i = min(tmp_regs, a_regs) + tmp_unroll_j = N + while tmp_unroll_j > 1 and (tmp_regs/(tmp_unroll_i*2)) < tmp_unroll_j: + tmp_unroll_j = int(tmp_unroll_j / 2) + + if tmp_unroll_i < a_regs or tmp_unroll_j < N: + dest.write("// performing {ops} operations between reuses of temporaries", ops=tmp_unroll_j*tmp_unroll_i) + + for tj in range(0, N, tmp_unroll_j): + for ti in range(0, a_regs, tmp_unroll_i): + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + if ti == 0 and tj==0: + dest.write("{acc_vector_t} tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") + dest.write("{acc_vector_t} tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") + else: + dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") + dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);") + dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);") + + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("{acc_vector_t} ACC{dest}r = tmp{tmp}r;") + dest.write("{acc_vector_t} ACC{dest}i = tmp{tmp}i;") + + with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')): + for i in range(N): + dest.write("B{i}r = B[bi+{i}*2+0];", i=i) + dest.write("B{i}i = B[bi+{i}*2+1];", i=i) + dest.write("bi += {N}*2;") + dest.write() + + for i in range(a_regs): + dest.write("A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i) + dest.write("A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", i=i) + + dest.write("ai += {M}*2;") + dest.write() + + + for tj in range(0, N, tmp_unroll_j): + for ti in range(0, a_regs, tmp_unroll_i): + # note the values in tmp{tmp}* are frequently of similar magnitude and opposite sign + # so accumulating them directly to ACC would lose precision when ACC is larger + + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") + dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);") + dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);") + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("ACC{dest}r = {__riscv_}vfadd( ACC{dest}r, tmp{tmp}r, gvl);") + dest.write("ACC{dest}i = {__riscv_}vfadd( ACC{dest}i, tmp{tmp}i, gvl);") + + dest.write() + dest.write("{index_type} ci=n_top*ldc+m_top;") + dest.write() + + for j in range(N): + if TRMM: + for i in range(a_regs): + with dest.map(idx=j*a_regs+i): + dest.write("{param_vector_t} C{idx}r = {__riscv_}vfmul( ACC{idx}r, alphar, gvl );") + dest.write("{param_vector_t} C{idx}i = {__riscv_}vfmul( ACC{idx}i, alphar, gvl );") + else: + for i in range(a_regs): + idx = j*a_regs+i + increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' + if idx == N*a_regs-1: + increment = '' + with dest.map(idx=j*a_regs+i, increment=increment): + dest.write("{param_vector_t} C{idx}r = {VLSEV}( &C[ci*2+0], sizeof(FLOAT)*2, gvl );") + dest.write("{param_vector_t} C{idx}i = {VLSEV}( &C[ci*2+1], sizeof(FLOAT)*2, gvl );") + dest.write("{increment}") + + if not TRMM: + for j in range(N): + for i in range(a_regs): + with dest.map(idx=j*a_regs+i): + dest.write("C{idx}r = {__riscv_}vfmacc( C{idx}r, alphar, ACC{idx}r, gvl );") + dest.write("C{idx}i = {__riscv_}vfmacc( C{idx}i, alphar, ACC{idx}i, gvl );") + + for j in range(N): + for i in range(a_regs): + with dest.map(idx=j*a_regs+i): + dest.write("C{idx}r = {__riscv_}vfnmsac( C{idx}r, alphai, ACC{idx}i, gvl );") + dest.write("C{idx}i = {__riscv_}vfmacc ( C{idx}i, alphai, ACC{idx}r, gvl );") + + if not TRMM: + dest.write() + dest.write("ci=n_top*ldc+m_top;") + dest.write() + + for j in range(N): + for i in range(a_regs): + idx = j*a_regs+i + increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' + if idx == N*a_regs-1: + increment = '' + with dest.map(idx=j*a_regs+i, increment=increment): + dest.write("{VSSEV}( &C[ci*2+0], sizeof(FLOAT)*2, C{idx}r, gvl);") + dest.write("{VSSEV}( &C[ci*2+1], sizeof(FLOAT)*2, C{idx}i, gvl);") + dest.write("{increment}") + +#----------------------------------------------------------------------- +def generate_gemm_kernel( settings, OUTPUT ): + if settings['conjugate'].value: + ERROR('conjugate gemm not yet supported') + + is_complex = settings['complex'].value + generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real + dest = Target(OUTPUT, { k:str(settings[k].value) for k in settings }) + + M = settings['M'].value + N = settings['N'].value + vlenmax = int(settings['reg_width_bits'].value * settings['LMUL_ACC'].value / + settings['ELEN_PARAM'].value) + a_regs = max(int(M/vlenmax), 1) + + # for each vector register loaded from matrix A, we require N registers to hold vector-scalar multiply-accumulate results + accumulation_regs = a_regs * N + required_regs = accumulation_regs + a_regs + if is_complex: + required_regs = required_regs * 2 + 2 + dest.write(''' +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define S0 1 + #define S1 -1 + #define S2 1 + #define S3 1 + #define VFMACC_RR __riscv_vfmsac{tail_policy} + #define VFMACC_RI __riscv_vfmacc{tail_policy} +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define S0 1 + #define S1 1 + #define S2 1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmacc{tail_policy} + #define VFMACC_RI __riscv_vfmsac{tail_policy} +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define S0 1 + #define S1 1 + #define S2 -1 + #define S3 1 + #define VFMACC_RR __riscv_vfmacc{tail_policy} + #define VFMACC_RI __riscv_vfnmsac{tail_policy} +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define S0 1 + #define S1 -1 + #define S2 -1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmsac{tail_policy} + #define VFMACC_RI __riscv_vfnmacc{tail_policy} +#endif +'''.format(tail_policy=settings['tail_policy'].value)) + + + if required_regs > (32 // settings['LMUL_ACC'].value): + raise Exception("{} vector registers needed during accumulation for unrolling {} x {}{} but only {} are available".format( + required_regs, N, M, (" with wide accumulator" if settings['LMUL_ACC'].value > 1 else ''), 32 // settings['LMUL_ACC'].value + )) + + TRMM = (settings['op'].value == 'trmm') + if TRMM: + with dest.block("#if defined(LEFT) != defined(TRANSA)", "#endif"): + dest.write("#define BACKWARDS") + + dest.write("int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, {alpha}, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc{trmm})", + alpha = ('FLOAT alphar, FLOAT alphai' if is_complex else 'FLOAT alpha'), + trmm = (', BLASLONG offset' if TRMM else '') + ) + + with dest.block("{{", "}}", elt_size='*2' if is_complex else ''): + if settings['trace'].value: + dest.write("printf(\"\\n\\nENTRY: %s(%d) M %d N %d K %d ldc %d\\n\", __FILE__, __LINE__, M, N, K, ldc);") + dest.write("{index_type} gvl = 0;") + dest.write("{index_type} m_top = 0;") + dest.write("{index_type} n_top = 0;") + + dest.write() + dest.write() + dest.write("// -- MAIN PASS") + + with dest.block("for ({index_type} j=0; j 0 ): + with dest.map(N=N_tail): + dest.write() + dest.write() + dest.write("// -- tails for N={N}") + with dest.block("if( N & {N} ) {{", "}}" ): + if settings['trace'].value: + dest.write("printf(\"N tail entry: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") + dest.write("gvl = {VSETVL}({vlenmax});", vlenmax=min(vlenmax,max(int(M/a_regs),1))) + dest.write("m_top = 0;") + with dest.block("for ({index_type} i=0; i M_tail_min ): + with dest.block("if( M & {M_tail} ) {{", "}}", M_tail=M_tail ): + if settings['trace'].value: + dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") + a_regs = max( 1, int(M_tail/vlenmax) ) + vlen = int(M_tail/a_regs) + dest.write("gvl = {VSETVL}({vlen});\n", vlen=vlen) + + generate_gemm_kernel_inner( settings, dest, M_tail, N, vlen, a_regs ) + dest.write( "m_top += {M_tail};" ) + + M_tail = int( M_tail / 2 ) + + while( M_tail > 0 ): + with dest.block("if( M & {M_tail} ) {{", "}}", + M_tail=M_tail, + N=N, + result_t = ('double' if settings['force_acc_double'].value else settings['param_scalar_t'].value) + ): + if settings['trace'].value: + dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") + for r in range(M_tail * N * (2 if is_complex else 1)): + dest.write("{result_t} result{r} = 0;", + r=r + ) + + dest.write("{index_type} ai=m_top*K{elt_size};") + dest.write("{index_type} bi=n_top*K{elt_size};") + + if TRMM: + with dest.map(M=M_tail, N=N): + generate_trmm_block( dest ) + + with dest.block("for({index_type} k=0; k<{Kend}; k++) {{", "}}", Kend = ('pass_K' if TRMM else 'K') ): + for ki in range( N ): + for kj in range( M_tail ): + if is_complex: + dest.write("result{dest}+=S0*A[ai+{kj}+0]*B[bi+{ki}+0] + S1*A[ai+{kj}+1]*B[bi+{ki}+1];".format( + dest=(ki*M_tail+kj)*2, kj=kj*2, ki=ki*2 + )) + dest.write("result{dest}+=S2*A[ai+{kj}+1]*B[bi+{ki}+0] + S3*A[ai+{kj}+0]*B[bi+{ki}+1];".format( + dest=(ki*M_tail+kj)*2+1, kj=kj*2, ki=ki*2 + )) + else: + dest.write("result{dest}+=A[ai+{kj}]*B[bi+{ki}];".format( + dest=ki*M_tail+kj, kj=kj, ki=ki + )) + dest.write("ai+={M_tail}{elt_size};") + dest.write("bi+={N}{elt_size};") + + dest.write("{index_type} ci=n_top*ldc+m_top;") + if is_complex: + dest.write("{result_t} Cr, Ci;") + for ki in range( N ): + for kj in range( M_tail ): + if is_complex: + if TRMM: + dest.write('Cr = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0) + dest.write('Ci = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1) + else: + dest.write('Cr = C[(ci+{ki}*ldc+{kj})*2+0];', ki=ki, kj=kj) + dest.write('Ci = C[(ci+{ki}*ldc+{kj})*2+1];', ki=ki, kj=kj) + dest.write('Cr += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0) + dest.write('Ci += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1) + dest.write('Cr -= result{dest}*alphai;', dest=(ki*M_tail+kj)*2+1) + dest.write('Ci += result{dest}*alphai;', dest=(ki*M_tail+kj)*2+0) + dest.write("C[(ci+{ki}*ldc+{kj})*2+0] = Cr;", ki=ki, kj=kj ) + dest.write("C[(ci+{ki}*ldc+{kj})*2+1] = Ci;", ki=ki, kj=kj ) + else: + op = '' if TRMM else '+' + dest.write("C[ci+{ki}*ldc+{kj}] {op}= alpha * result{dest};", + ki=ki, kj=kj, op=op, dest=ki*M_tail+kj + ) + dest.write("m_top+={M_tail};") + + M_tail = int(M_tail/2) + + +#----------------------------------------------------------------------- +class Setting(object): + def __init__( self, value, convert = None ): + self._value = value + self._convert = convert + + @classmethod + def ENUM( cls, *values ): + def closure( values ): + return lambda value: values[value.lower()] + return closure( { v.lower():v for v in values } ) + + @classmethod + def BOOL( cls, value ): + return value.lower().startswith('t') or value == '1' + + @property + def value( self ): + return self._value + + @property + def configurable( self ): + return self._convert is not None + + @value.setter + def value( self, value ): + self._value = self._convert( value ) + + def __str__( self ): + return str(self._value) + +#----------------------------------------------------------------------- +def main(): + settings = { + 'op': Setting( 'gemm', Setting.ENUM( 'gemm', 'trmm' ) ), + 'M': Setting( 16, int ), + 'N': Setting( 4, int ), + 'reg_width_bits': Setting( 256, int ), + 'LMUL': Setting( 1, int ), + 'M_tail_scalar_from':Setting( 2, int ), + 'cpu': Setting( 'zvl256b', str ), + 'param_precision': Setting( 'float', Setting.ENUM( 'float', 'double' ) ), + 'force_acc_double': Setting( False, Setting.BOOL ), + 'complex': Setting( False, Setting.BOOL ), + 'conjugate': Setting( False, Setting.BOOL ), + 'index_type': Setting( 'BLASLONG', str ), + 'trace': Setting( False, Setting.BOOL ), + 'output': Setting( None, str ), + 'tail_policy': Setting( '', str ), # _ta, if toolchain supports it + '__riscv_': Setting( '__riscv_', str), + } + + for item in sys.argv[1:]: + try: + name, value = tuple(item.split( '=', 1 )) + except: + ERROR("couldn't parse {}, expected arguments of the form name=value".format(item)) + + if name not in settings: + ERROR("couldn't parse {}, {} it is not a known option\n".format( item, name ) + +"options (and current defaults) are\n{}".format( + " ".join([ '{}={}'.format(k, settings[k].value) for k in settings.keys()])) + ) + + try: + settings[name].value = value + except: + import traceback + traceback.print_exc() + ERROR("couldn't parse {}".format(item)) + + if settings['output'].value is None: + if settings['complex'].value: + prefix = 'z' if settings['param_precision'].value == 'double' else 'c' + else: + prefix = 'd' if settings['param_precision'].value == 'double' else 's' + settings['output'] = Setting('{}{}_kernel_{}x{}_{}.c'.format( + prefix, + settings['op'], + settings['M'], + settings['N'], + settings['cpu'] + )) + + if settings['param_precision'].value == 'double': + settings['param_scalar_t'] = Setting( 'double' ) + settings['ELEN_PARAM'] = Setting(64) + else: + settings['param_scalar_t'] = Setting( 'float' ) + settings['ELEN_PARAM'] = Setting(32) + + settings['VFMUL'] = Setting( '{}vfmul_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) ) + settings['VFMACC'] = Setting( '{}vfmacc_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) ) + + settings['ELEN_ACC'] = settings['ELEN_PARAM'] + settings['LMUL_ACC'] = Setting(settings['LMUL'].value) + widen = '' + + if settings['force_acc_double'].value and (settings['param_precision'].value == 'float'): + settings['ELEN_ACC'] = Setting(64) + settings['LMUL_ACC'] = Setting(settings['LMUL'].value*2) + settings['VFNCVT'] = Setting('{}vfncvt_f_f_w_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy'])) + widen = 'w' + + settings['VMUL_TO_ACC'] = Setting( '{}vf{}mul_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) ) + settings['VMACC_TO_ACC'] = Setting( '{}vf{}macc_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) ) + + settings['param_vector_t']=Setting('vfloat{}m{}_t'.format(settings['ELEN_PARAM'], settings['LMUL'])) + settings['acc_vector_t'] =Setting('vfloat{}m{}_t'.format(settings['ELEN_ACC'], settings['LMUL_ACC'])) + settings['VLEV'] =Setting('{}vle{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VSEV'] =Setting('{}vse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VLSEV'] =Setting('{}vlse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VSSEV'] =Setting('{}vsse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VSETVL'] =Setting('{}vsetvl_e{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'])) + + + to_stdout = (settings['output'].value == '-') + if not to_stdout: + print("Writing {}".format(settings['output'].value), file=sys.stderr) + + with open(sys.stdout.fileno() if to_stdout else settings['output'].value, 'w') as destination_file: + def OUTPUT(*args, **kwargs): + print(*args, file=destination_file, **kwargs) + + OUTPUT("/*\n\nAUTOGENERATED KERNEL\nSettings:\n {}".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if settings[k].configurable]))) + OUTPUT("Derived:\n {}\n*/\n".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if not settings[k].configurable]))) + + OUTPUT('#include "common.h"') + OUTPUT("\n") + + if settings['op'].value in ('gemm', 'trmm'): + generate_gemm_kernel(settings, OUTPUT) + else: + ERROR("unsupported kernel type {}".format(settings['op'])) + +if __name__ == "__main__": + main() diff --git a/kernel/riscv64/iamax_rvv.c b/kernel/riscv64/iamax_rvv.c new file mode 100644 index 000000000..8362d7cef --- /dev/null +++ b/kernel/riscv64/iamax_rvv.c @@ -0,0 +1,149 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu +#define VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx, v_max; + UINT_V_T v_max_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_max_index = VMVVX_UINT(0, vlmax); + v_max = VFMVVF_FLOAT(-1, vlmax); + BLASLONG j=0; + FLOAT maxf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); + + //update v_max + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); + + //update v_max + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); + max_index = VFIRSTM(mask, vlmax); + + v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); + max_index = VMVVXS_UINT(v_max_index); + + return(max_index+1); +} diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c index ecb4cd7a9..800312400 100644 --- a/kernel/riscv64/iamax_vector.c +++ b/kernel/riscv64/iamax_vector.c @@ -27,118 +27,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include +#include #if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 -#define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f64m4_f64m1(v_res, va, vb, gvl) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m4_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m4)(vm, compressed, va, gvl) +#else +#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f64m4_f64m1) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m4_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m4) +#endif +#define MASK_T vbool16_t +#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f64m4_b16) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f64m4) +#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f64m4_b16) +#define VMFIRSTM RISCV_RVV(vfirst_m_b16) +#define UINT_V_T vuint64m4_t +#define VIDV_UINT RISCV_RVV(vid_v_u64m4) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m4) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m4) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m4) +#define VMV_X RISCV_RVV(vmv_x_s_u64m4_u64) #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 -#define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f32m4_f32m1(v_res, va, vb, gvl) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m4_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m4)(vm, compressed, va, gvl) +#else +#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f32m4_f32m1) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m4_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m4) +#endif +#define MASK_T vbool8_t +#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f32m4_b8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f32m4) +#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f32m4_b8) +#define VMFIRSTM RISCV_RVV(vfirst_m_b8) +#define UINT_V_T vuint32m4_t +#define VIDV_UINT RISCV_RVV(vid_v_u32m4) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m4) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m4) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m4) +#define VMV_X RISCV_RVV(vmv_x_s_u32m4_u32) #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT maxf=0.0; + BLASLONG i=0, j=0; unsigned int max_index = 0; - if (n <= 0 || inc_x <= 0) return(max_index); + if (n <= 0 || inc_x <= 0) return(max_index); + FLOAT maxf=-FLT_MAX; FLOAT_V_T vx, v_max; UINT_V_T v_max_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); + + gvl = VSETVL(n); + UINT_V_T vid = VIDV_UINT(gvl); if(inc_x == 1){ - gvl = VSETVL(n); v_max_index = VMVVX_UINT(0, gvl); - v_max = VFMVVF_FLOAT(-1, gvl); + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_max = VLEV_FLOAT(&x[j], gvl); + v_max = VFABS_FLOAT(v_max, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); if(cur_maxf > maxf){ //tail index - v_max_index = VIDV_UINT(gvl); - v_max_index = VADDVX_UINT(v_max_index, j, gvl); + v_max_index = VADDVX_UINT(vid, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } }else{ @@ -146,51 +151,48 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); v_max_index = VMVVX_UINT(0, gvl); - v_max = VFMVVF_FLOAT(-1, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = *((FLOAT*)&v_res); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); + v_max = VFABS_FLOAT(v_max, gvl); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); if(cur_maxf > maxf){ //tail index - v_max_index = VIDV_UINT(gvl); - v_max_index = VADDVX_UINT(v_max_index, j, gvl); + v_max_index = VADDVX_UINT(vid, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } } - return(max_index+1); + return(max_index+1); } - - diff --git a/kernel/riscv64/iamin_rvv.c b/kernel/riscv64/iamin_rvv.c new file mode 100644 index 000000000..f90dbb545 --- /dev/null +++ b/kernel/riscv64/iamin_rvv.c @@ -0,0 +1,150 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu +#define VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_min_index = VMVVX_UINT(0, vlmax); + v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT minf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + + v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, vlmax); + min_index = VFIRSTM(mask, vlmax); + + v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax); + min_index = VMVVXS_UINT(v_min_index); + + return(min_index+1); +} diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c index c72bb94cc..0e591e697 100644 --- a/kernel/riscv64/iamin_vector.c +++ b/kernel/riscv64/iamin_vector.c @@ -31,85 +31,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 -#define UINT_V_T vuint64m8_t +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f64m8_f64m1(v_res, va, vb, gvl) #define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 #define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl) +#else +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8) +#endif +#define MASK_T vbool8_t +#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f64m8_b8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f64m8) +#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f64m8_b8) +#define VMFIRSTM RISCV_RVV(vfirst_m_b8) +#define UINT_V_T vuint64m8_t +#define VIDV_UINT RISCV_RVV(vid_v_u64m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m8) +#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64) #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 -#define UINT_V_T vuint32m8_t +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f32m8_f32m1(v_res, va, vb, gvl) #define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 #define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl) +#else +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8) +#endif +#define MASK_T vbool4_t +#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f32m8_b4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f32m8) +#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f32m8_b4) +#define VMFIRSTM RISCV_RVV(vfirst_m_b4) +#define UINT_V_T vuint32m8_t +#define VIDV_UINT RISCV_RVV(vid_v_u32m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m8) +#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32) #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; unsigned int min_index = 0; - if (n <= 0 || inc_x <= 0) return(min_index); + if (n <= 0 || inc_x <= 0) return(min_index); + FLOAT minf=FLT_MAX; FLOAT_V_T vx, v_min; UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); - v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); @@ -117,29 +125,29 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_min = VLEV_FLOAT(&x[j], gvl); + v_min = VFABS_FLOAT(v_min, gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } }else{ @@ -151,12 +159,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); @@ -165,33 +171,31 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl; idx += inc_v; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); + v_min = VFABS_FLOAT(v_min, gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } } - return(min_index+1); + return(min_index+1); } - - diff --git a/kernel/riscv64/imax_rvv.c b/kernel/riscv64/imax_rvv.c new file mode 100644 index 000000000..b1a77b178 --- /dev/null +++ b/kernel/riscv64/imax_rvv.c @@ -0,0 +1,146 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu +#define VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx, v_max; + UINT_V_T v_max_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_max_index = VMVVX_UINT(0, vlmax); + v_max = VFMVVF_FLOAT(-FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT maxf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); + max_index = VFIRSTM(mask, vlmax); + + v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); + max_index = VMVVXS_UINT(v_max_index); + + return(max_index+1); +} diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c index c2d787ab8..308fa15a4 100644 --- a/kernel/riscv64/imax_vector.c +++ b/kernel/riscv64/imax_vector.c @@ -31,68 +31,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f64m8_f64m1(v_res, va, vb, gvl) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl) +#else +#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f64m8_f64m1) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_mu) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8) +#endif #define MASK_T vbool8_t -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f64m8_b8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f64m8) +#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f64m8_b8) +#define VMFIRSTM RISCV_RVV(vfirst_m_b8) #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_UINT RISCV_RVV(vid_v_u64m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8) +#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64) #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f32m8_f32m1(v_res, va, vb, gvl) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_m) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl) +#else +#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f32m8_f32m1) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_mu) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8) +#endif #define MASK_T vbool4_t -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f32m8_b4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f32m8) +#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f32m8_b4) +#define VMFIRSTM RISCV_RVV(vfirst_m_b4) #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VIDV_UINT RISCV_RVV(vid_v_u32m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8) +#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32) #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; + BLASLONG i=0, j=0; unsigned int max_index = 0; - if (n <= 0 || inc_x <= 0) return(max_index); - FLOAT maxf=-FLT_MAX; + if (n <= 0 || inc_x <= 0) return(max_index); + FLOAT maxf=-FLT_MAX; FLOAT_V_T vx, v_max; UINT_V_T v_max_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_min; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); @@ -104,32 +116,34 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_max = VLEV_FLOAT(&x[j], gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); v_max_index = VADDVX_UINT(v_max_index, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } }else{ @@ -145,37 +159,37 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); v_max_index = VADDVX_UINT(v_max_index, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } } - return(max_index+1); + return(max_index+1); } - - diff --git a/kernel/riscv64/imin_rvv.c b/kernel/riscv64/imin_rvv.c new file mode 100644 index 000000000..1de7f3233 --- /dev/null +++ b/kernel/riscv64/imin_rvv.c @@ -0,0 +1,146 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu +#define VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_min_index = VMVVX_UINT(0, vlmax); + v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT minf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + + v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, vlmax); + min_index = VFIRSTM(mask, vlmax); + + v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax); + min_index = VMVVXS_UINT(v_min_index); + + return(min_index+1); +} diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c index dfe9a3310..ec36b8eb9 100644 --- a/kernel/riscv64/imin_vector.c +++ b/kernel/riscv64/imin_vector.c @@ -31,122 +31,119 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f64m8_f64m1(v_res, va, vb, gvl) +#define VIDV_MASK_UINT(mask, gvl) RISCV_RVV(vid_v_u64m8_m)(mask, v_min_index, gvl) +#define VADDVX_MASK_UINT(mask, a, b, gvl) RISCV_RVV(vadd_vx_u64m8_m)(mask, a, a, b, gvl) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl) +#else +#define VFREDMINVS_FLOAT RISCV_RVV(vfredmin_vs_f64m8_f64m1) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8) +#endif #define MASK_T vbool8_t -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f64m8_b8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f64m8) +#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f64m8_b8) +#define VMFIRSTM RISCV_RVV(vfirst_m_b8) #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_UINT RISCV_RVV(vid_v_u64m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8) +#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64) #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f32m8_f32m1(v_res, va, vb, gvl) +#define VIDV_MASK_UINT(mask, gvl) RISCV_RVV(vid_v_u32m8_m)(mask, v_min_index, gvl) +#define VADDVX_MASK_UINT(mask, a, b, gvl) RISCV_RVV(vadd_vx_u32m8_m)(mask, a, a, b, gvl) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl) +#else +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8) +#endif #define MASK_T vbool4_t -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f32m8_b4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f32m8) +#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f32m8_b4) +#define VMFIRSTM RISCV_RVV(vfirst_m_b4) #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VIDV_UINT RISCV_RVV(vid_v_u32m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8) +#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32) #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; unsigned int min_index = 0; - if (n <= 0 || inc_x <= 0) return(min_index); + if (n <= 0 || inc_x <= 0) return(min_index); + FLOAT minf=FLT_MAX; FLOAT_V_T vx, v_min; UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); - v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#endif -*/ - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); + + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); + v_min_index = VIDV_MASK_UINT(mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_min = VLEV_FLOAT(&x[j], gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); - if(cur_minf < minf){ + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); + if(cur_minf > minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } }else{ @@ -159,59 +156,39 @@ asm volatile( for(i=0,j=0; i < n/gvl; i++){ vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#endif -*/ - - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); + v_min_index = VIDV_MASK_UINT(mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; idx += inc_v; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); - - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); - if(cur_minf < minf){ + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); + if(cur_minf > minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } } - return(min_index+1); + return(min_index+1); } - - diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c new file mode 100644 index 000000000..32f66a7a7 --- /dev/null +++ b/kernel/riscv64/izamax_rvv.c @@ -0,0 +1,172 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m4_b16 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFIRSTM __riscv_vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m4_tumu +#define VIDV_UINT __riscv_vid_v_u64m4 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m4_tumu +#define VADDVX_UINT __riscv_vadd_vx_u64m4 +#define VMVVX_UINT __riscv_vmv_v_x_u64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m4_u64 +#else +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m4_b8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m4_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m4_tumu +#define VIDV_UINT __riscv_vid_v_u32m4 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m4_tumu +#define VADDVX_UINT __riscv_vadd_vx_u32m4 +#define VMVVX_UINT __riscv_vmv_v_x_u32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m4_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx0, vx1, v_max; + FLOAT_VX2_T vxx2; + UINT_V_T v_max_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_max_index = VMVVX_UINT(0, vlmax); + v_max = VFMVVF_FLOAT(-1, vlmax); + BLASLONG j=0; + FLOAT maxf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx0, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx0, vl); + } + } + else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx0, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx0, vl); + } + + } + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); + max_index = VFIRSTM(mask, vlmax); + + v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); + max_index = VMVVXS_UINT(v_max_index); + + return(max_index+1); +} diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c index fdbdc3ae8..d33e89c00 100644 --- a/kernel/riscv64/izamax_vector.c +++ b/kernel/riscv64/izamax_vector.c @@ -27,241 +27,146 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include +#include #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmax_vs_f64m8_f64m1)(v_res, va, vb, gvl) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl) +#else +#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f64m8_f64m1) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_mu) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8) +#endif #define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f64m8_b8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f64m8) +#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f64m8_b8) +#define VMFIRSTM RISCV_RVV(vfirst_m_b8) #define UINT_V_T vuint64m8_t -#define VSEVU_UINT vse64_v_u64m8 +#define VSEVU_UINT RISCV_RVV(vse64_v_u64m8) #define UINT_T long unsigned int -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_UINT RISCV_RVV(vid_v_u64m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m8) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8) +#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64) #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmax_vs_f32m8_f32m1)(v_res, va, vb, gvl) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_m) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl) +#else +#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f32m8_f32m1) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_mu) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8) +#endif #define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f32m8_b4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f32m8) +#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f32m8_b4) +#define VMFIRSTM RISCV_RVV(vfirst_m_b4) #define UINT_V_T vuint32m8_t #define UINT_T unsigned int -#define VSEVU_UINT vse32_v_u32m8 -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VSEVU_UINT RISCV_RVV(vse32_v_u32m8) +#define VIDV_UINT RISCV_RVV(vid_v_u32m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m8) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8) +#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32) #endif -#define RVV_M RVV_M8 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT maxf=0.0; + BLASLONG i=0, j=0; unsigned int max_index = 0; - if (n <= 0 || inc_x <= 0) return(max_index); + if (n <= 0 || inc_x <= 0) return(max_index); + FLOAT maxf=-FLT_MAX; - FLOAT_V_T vx0, vx1, v_max; + FLOAT_V_T vx, vx2, v_max; UINT_V_T v_max_index; - MASK_T mask0, mask1; + MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); gvl = VSETVL(n); - UINT_T temp_uint[gvl]; + unsigned int stride_x = inc_x * 2 * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x * 2; + + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); v_max_index = VMVVX_UINT(0, gvl); - v_max = VFMVVF_FLOAT(-1, gvl); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = gvl * inc_x * 2; - BLASLONG ix = 0; for(i=0,j=0; i < n/gvl; i++){ - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + vx = VFABS_FLOAT(vx, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + vx = VFADDVV_FLOAT(vx, vx2, gvl); + //index where element greater than v_max - mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); - v_max_index = VIDV_MASK_UINT(mask0, v_max_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_max_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_max_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#endif -*/ - v_max_index = VADDVX_MASK_UINT(mask0, v_max_index, v_max_index, j, gvl); + mask = VMFLTVV_FLOAT(v_max, vx, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j - v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); + v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; - ix += inc_xv; + idx += inc_v; } - vx0 = VFMVVF_FLOAT(0, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = VFMVFS_FLOAT(v_res); - mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_max_index,gvl); - max_index = temp_uint[max_index]; - + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - v_max_index = VMVVX_UINT(0, gvl); - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - v_max = VFADDVV_FLOAT(vx0, vx1, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = VFMVFS_FLOAT(v_res); + v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + v_max = VFABS_FLOAT(v_max, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + v_max = VFADDVV_FLOAT(v_max, vx2, gvl); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); + if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); v_max_index = VADDVX_UINT(v_max_index, j, gvl); - mask0 = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_max_index,gvl); - max_index = temp_uint[max_index]; - + mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } - return(max_index+1); -} - + return(max_index+1); +} diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c new file mode 100644 index 000000000..d34b220fa --- /dev/null +++ b/kernel/riscv64/izamin_rvv.c @@ -0,0 +1,171 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m4_b16 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFIRSTM __riscv_vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m4_tumu +#define VIDV_UINT __riscv_vid_v_u64m4 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m4_tumu +#define VADDVX_UINT __riscv_vadd_vx_u64m4 +#define VMVVX_UINT __riscv_vmv_v_x_u64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m4_u64 +#else +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m4_b8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m4_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m4_tumu +#define VIDV_UINT __riscv_vid_v_u32m4 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m4_tumu +#define VADDVX_UINT __riscv_vadd_vx_u32m4 +#define VMVVX_UINT __riscv_vmv_v_x_u32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m4_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx0, vx1, v_min; + FLOAT_VX2_T vxx2; + UINT_V_T v_min_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_min_index = VMVVX_UINT(0, vlmax); + v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT minf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx0, v_min, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx0, v_min, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx0, vl); + } + + } + + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + + v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, vlmax); + min_index = VFIRSTM(mask, vlmax); + + v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax); + min_index = VMVVXS_UINT(v_min_index); + + return(min_index+1); +} diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c index 59c720310..c76a38099 100644 --- a/kernel/riscv64/izamin_vector.c +++ b/kernel/riscv64/izamin_vector.c @@ -31,235 +31,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmin_vs_f64m8_f64m1)(v_res, va, vb, gvl) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl) +#else +#define VFREDMINVS_FLOAT RISCV_RVV(vfredmin_vs_f64m8_f64m1) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_mu) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8) +#endif #define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f64m8_b8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f64m8) +#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f64m8_b8) +#define VMFIRSTM RISCV_RVV(vfirst_m_b8) #define UINT_V_T vuint64m8_t #define VSEVU_UINT vse64_v_u64m8 #define UINT_T long unsigned int -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_UINT RISCV_RVV(vid_v_u64m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m8) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8) +#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64) #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmin_vs_f32m8_f32m1)(v_res, va, vb, gvl) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_m) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl) +#else +#define VFREDMINVS_FLOAT RISCV_RVV(vfredmin_vs_f32m8_f32m1) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_mu) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8) +#endif #define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f32m8_b4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f32m8) +#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f32m8_b4) +#define VMFIRSTM RISCV_RVV(vfirst_m_b4) #define UINT_V_T vuint32m8_t #define UINT_T unsigned int -#define VSEVU_UINT vse32_v_u32m8 -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VSEVU_UINT RISCV_RVV(vse32_v_u32m8) +#define VIDV_UINT RISCV_RVV(vid_v_u32m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m8) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8) +#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32) #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; unsigned int min_index = 0; - if (n <= 0 || inc_x <= 0) return(min_index); + if (n <= 0 || inc_x <= 0) return(min_index); + FLOAT minf=FLT_MAX; - FLOAT_V_T vx0, vx1, v_min; + FLOAT_V_T vx, vx2, v_min; UINT_V_T v_min_index; - MASK_T mask0, mask1; + MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); gvl = VSETVL(n); - UINT_T temp_uint[gvl]; - v_min_index = VMVVX_UINT(0, gvl); + unsigned int stride_x = inc_x * 2 * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x * 2; + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = gvl * inc_x * 2; - BLASLONG ix = 0; + v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + vx = VFABS_FLOAT(vx, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + vx = VFADDVV_FLOAT(vx, vx2, gvl); - //index where element less than v_min - mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); - v_min_index = VIDV_MASK_UINT(mask0, v_min_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#endif -*/ - v_min_index = VADDVX_MASK_UINT(mask0, v_min_index, v_min_index, j, gvl); + + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx0, gvl); + v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; - ix += inc_xv; + idx += inc_v; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = VFMVFS_FLOAT(v_res); - mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_min_index,gvl); - min_index = temp_uint[min_index]; + + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - v_min_index = VMVVX_UINT(0, gvl); - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - v_min = VFADDVV_FLOAT(vx0, vx1, gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = VFMVFS_FLOAT(v_res); + v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + v_min = VFABS_FLOAT(v_min, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + v_min = VFADDVV_FLOAT(v_min, vx2, gvl); + + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); - mask0 = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_min_index,gvl); - min_index = temp_uint[min_index]; - + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } - return(min_index+1); -} - + return(min_index+1); +} diff --git a/kernel/riscv64/max_rvv.c b/kernel/riscv64/max_rvv.c new file mode 100644 index 000000000..745c27bf4 --- /dev/null +++ b/kernel/riscv64/max_rvv.c @@ -0,0 +1,98 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + FLOAT_V_T vx, vmax; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmax = VFMVVF_FLOAT(-FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); + } + + } + + v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + + return(maxf); +} diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c index b988513c9..ee9920cd2 100644 --- a/kernel/riscv64/max_vector.c +++ b/kernel/riscv64/max_vector.c @@ -28,29 +28,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif +#else +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif +#endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) #endif +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFMAXVV_FLOAT JOIN(RISCV_RVV(vfmax), _vv_f, ELEN, LMUL, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -59,10 +77,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=-FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; - FLOAT_V_T_M1 v_res, v_min; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); @@ -76,15 +92,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_max = VFMAXVV_FLOAT(v_max, v1, gvl); j += gvl * 2; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); } for(;j maxf) - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } }else{ @@ -102,18 +115,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl * 2; idx += inc_xv * 2; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); } for(;j maxf) - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } } + maxf = EXTRACT_FLOAT(v_res); return(maxf); } diff --git a/kernel/riscv64/min_rvv.c b/kernel/riscv64/min_rvv.c new file mode 100644 index 000000000..78528fef9 --- /dev/null +++ b/kernel/riscv64/min_rvv.c @@ -0,0 +1,98 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + FLOAT_V_T vx, vmin; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); + } + + } + + v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + + return(minf); +} diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c index be0803df6..2001840bb 100644 --- a/kernel/riscv64/min_vector.c +++ b/kernel/riscv64/min_vector.c @@ -28,29 +28,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif +#else +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif +#endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) #endif +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFMINVV_FLOAT JOIN(RISCV_RVV(vfmin), _vv_f, ELEN, LMUL, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -59,10 +77,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); @@ -76,15 +92,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_min = VFMINVV_FLOAT(v_min, v1, gvl); j += gvl * 2; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); } for(;j= gvl ) // don't pay overheads if we're not doing useful work + { + for(i=0; i= gvl ) // don't pay overheads if we're not doing useful work + { + for(i=0; i 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSEV_FLOAT(x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSEV_FLOAT(y, v1, vl); + } + + } else if(inc_y == 1) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSEV_FLOAT(y, v1, vl); + } + + } else if(inc_x == 1) { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSEV_FLOAT(x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSSEV_FLOAT(y, stride_y, v1, vl); + } + + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSSEV_FLOAT(y, stride_y, v1, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/rot_vector.c b/kernel/riscv64/rot_vector.c index 9b48d1c69..649d9bb94 100644 --- a/kernel/riscv64/rot_vector.c +++ b/kernel/riscv64/rot_vector.c @@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFMSACVF_FLOAT vfmsac_vf_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFMSACVF_FLOAT vfmsac_vf_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f64m4) #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) @@ -57,11 +57,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT BLASLONG ix=0,iy=0; if(n <= 0) return(0); - unsigned int gvl = 0; + unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); FLOAT_V_T v0, v1, vx, vy; if(inc_x == 1 && inc_y == 1){ - gvl = VSETVL(n); for(i=0,j=0; i 0; n -= vl, x += vl) { + vl = VSETVL(n); + VSEV_FLOAT(x, v0, vl); + } + } + else { + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + v0 = VLEV_FLOAT(x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSEV_FLOAT(x, v0, vl); + } + } + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + if(da == 0.0) { + int gvl = VSETVL_MAX; + v0 = VFMVVF_FLOAT(0.0, gvl); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + VSSEV_FLOAT(x, stride_x, v0, vl); + } + } + else { + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + v0 = VLSEV_FLOAT(x, stride_x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + } + } + } + + return 0; +} diff --git a/kernel/riscv64/scal_vector.c b/kernel/riscv64/scal_vector.c index 7a3153b7c..8fa9315f6 100644 --- a/kernel/riscv64/scal_vector.c +++ b/kernel/riscv64/scal_vector.c @@ -26,28 +26,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 -#define VFMULVF_FLOAT vfmul_vf_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 -#define VFMULVF_FLOAT vfmul_vf_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMULVF_FLOAT JOIN(RISCV_RVV(vfmul), _vf_f, ELEN, LMUL, _) + int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0,j=0; @@ -84,25 +97,25 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS } }else{ if(da == 0.0){ + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG ix = 0; gvl = VSETVL(n); - BLASLONG stride_x = inc_x * sizeof(FLOAT); - BLASLONG ix = 0; - if(gvl <= n / 2){ - long int inc_xv = gvl * inc_x; - v0 = VFMVVF_FLOAT(0, gvl); - for(i = 0, j = 0; i < n/(2*gvl); i++, j+=2*gvl){ - VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); - VSSEV_FLOAT(&x[ix + inc_xv], stride_x, v0, gvl); - ix += inc_xv * 2; - } + v0 = VFMVVF_FLOAT(0, gvl); + + for(i = 0; i < n/(gvl*2); ++i ){ + VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); + ix += inc_x * gvl; + VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); + ix += inc_x * gvl; } - //tail - for(; j 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); + } + + } + + v_res = VFREDSUMVS_FLOAT(vsum, v_res, vlmax); + sumf = VFMVFS_FLOAT_M1(v_res); + return(sumf); +} diff --git a/kernel/riscv64/sum_vector.c b/kernel/riscv64/sum_vector.c new file mode 100644 index 000000000..cf734faab --- /dev/null +++ b/kernel/riscv64/sum_vector.c @@ -0,0 +1,114 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#define VFREDSUMVS_FLOAT RISCV_RVV(vfredusum_vs_f32m8_f32m1) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#define VFREDSUMVS_FLOAT RISCV_RVV(vfredusum_vs_f64m8_f64m1) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8) +#endif +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_sum; + FLOAT_V_T_M1 v_res; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + + if(inc_x == 1){ + gvl = VSETVL(n); + if(gvl <= n/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i 0; m -= vl, ptr -= vl*inc_y) { + vl = VSETVL(m); + vy = VLSEV_FLOAT(ptr - 1, stride_y, vl); + VSSEV_FLOAT(ptr, stride_y, vy, vl); + } + y[0] = temp; + } + else if(inc_y == 0) { + FLOAT temp = y[0]; + y[0] = x[(n - 1) * inc_x]; + FLOAT* ptr = x + (n - 1) * inc_x; // start from the last one + stride_x = (0 - inc_x) * sizeof(FLOAT); // reverse + BLASLONG m = n - 1; + for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_x) { + vl = VSETVL(m); + vx = VLSEV_FLOAT(ptr - 1, stride_x, vl); + VSSEV_FLOAT(ptr, stride_x, vx, vl); + } + x[0] = temp; + } + else if(inc_x == 1 && inc_y == 1) { + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + VSEV_FLOAT(y, vx, vl); + VSEV_FLOAT(x, vy, vl); + } + + } else if (inc_y == 1) { + stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + VSEV_FLOAT(y, vx, vl); + VSSEV_FLOAT(x, stride_x, vy, vl); + } + + } else if(inc_x == 1) { + stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + VSSEV_FLOAT(y, stride_y, vx, vl); + VSEV_FLOAT(x, vy, vl); + } + + } else { + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + VSSEV_FLOAT(y, stride_y, vx, vl); + VSSEV_FLOAT(x, stride_x, vy, vl); + } + } + + return(0); +} diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c index d9421e2f1..f583f5392 100644 --- a/kernel/riscv64/swap_vector.c +++ b/kernel/riscv64/swap_vector.c @@ -27,35 +27,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) + int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0, j = 0; BLASLONG ix = 0,iy = 0; BLASLONG stride_x, stride_y; FLOAT_V_T vx0, vx1, vy0, vy1; - unsigned int gvl = 0; - if (n < 0) return(0); + if (n <= 0) return(0); + + unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); + if( inc_x == 0 && inc_y == 0 ) { n = n & 1; } + if(inc_x == 1 && inc_y == 1){ - gvl = VSETVL(n); if(gvl <= n/2){ for(i=0,j=0; i 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posX + posY * lda; + ao2 = a + posY + (posX) * lda; + + for (i = m; i > 0; i--, offset--) { + va2 = VLSEV_FLOAT(ao2, stride_lda, vl); + va1 = VLEV_FLOAT(ao1, vl); + + // offset > (0 - vindex) ---> (offset + vindex) > 0 + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb = VMERGE_VVM_FLOAT(va2, va1, vbool, vl); + VSEV_FLOAT(b, vb, vl); + + b += vl; + ao1 += lda; + ao2++; + } + } + + return 0; +} + diff --git a/kernel/riscv64/symm_ucopy_rvv_v1.c b/kernel/riscv64/symm_ucopy_rvv_v1.c new file mode 100644 index 000000000..464f97b3a --- /dev/null +++ b/kernel/riscv64/symm_ucopy_rvv_v1.c @@ -0,0 +1,100 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#endif + +// Optimizes the implementation in ../generic/symm_ucopy_4.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T vb, va1, va2; + VBOOL_T vbool; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posX + 0 + posY * lda; + + for (i = m; i > 0; i--, offset--) { + va1 = VLSEV_FLOAT(ao1, stride_lda, vl); + va2 = VLEV_FLOAT(ao2, vl); + + // offset > (0 - vindex) ---> (offset + vindex) > 0 + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb = VMERGE_VVM_FLOAT(va2, va1, vbool, vl); + VSEV_FLOAT(b, vb, vl); + + b += vl; + ao1++; + ao2 += lda; + } + } + + return 0; +} diff --git a/kernel/riscv64/symv_L_rvv.c b/kernel/riscv64/symv_L_rvv.c new file mode 100644 index 000000000..888d628a5 --- /dev/null +++ b/kernel/riscv64/symv_L_rvv.c @@ -0,0 +1,219 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1; + FLOAT *a_ptr = a; + + FLOAT_V_T_M1 v_res, v_z0; + size_t vlmax = VSETVL_MAX_M1, vl; + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + vlmax = VSETVL_MAX; + + FLOAT_V_T va, vx, vy, vr; + BLASLONG stride_x, stride_y, inc_xv, inc_yv; + + if(inc_x == 1 && inc_y == 1) + { + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLEV_FLOAT(&y[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + + } + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + + y[j] += alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + } + } + else if(inc_x == 1) + { + jy = 0; + stride_y = inc_y * sizeof(FLOAT); + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_yv = inc_y * vl; + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + + y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); + jy += inc_y; + a_ptr += lda; + } + } + else if(inc_y == 1) + { + jx = 0; + stride_x = inc_x * sizeof(FLOAT); + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_xv = inc_x * vl; + + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLEV_FLOAT(&y[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + + ix += inc_xv; + } + + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + + y[j] += alpha * VFMVFS_FLOAT_M1(v_res); + jx += inc_x; + a_ptr += lda; + } + } + else + { + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + jx = 0; + jy = 0; + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_xv = inc_x * vl; + inc_yv = inc_y * vl; + + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + + ix += inc_xv; + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + + y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); + jx += inc_x; + jy += inc_y; + a_ptr += lda; + } + } + return(0); +} + diff --git a/kernel/riscv64/symv_L_vector.c b/kernel/riscv64/symv_L_vector.c index 6588f4dda..cd89c63ec 100644 --- a/kernel/riscv64/symv_L_vector.c +++ b/kernel/riscv64/symv_L_vector.c @@ -27,37 +27,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -99,8 +105,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < m){ gvl = VSETVL(m-i); vy = VLEV_FLOAT(&y[i], gvl); @@ -110,8 +116,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[j] += alpha * temp2; @@ -144,8 +150,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < m){ gvl = VSETVL(m-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -155,8 +161,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[jy] += alpha * temp2; @@ -190,8 +196,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; ix += inc_xv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < m){ gvl = VSETVL(m-i); vy = VLEV_FLOAT(&y[i], gvl); @@ -201,8 +207,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[j] += alpha * temp2; @@ -241,8 +247,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < m){ gvl = VSETVL(m-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -252,8 +258,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[jy] += alpha * temp2; diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c new file mode 100644 index 000000000..bcd2f6981 --- /dev/null +++ b/kernel/riscv64/symv_U_rvv.c @@ -0,0 +1,216 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1; + FLOAT *a_ptr = a; + FLOAT_V_T_M1 v_res, v_z0; + size_t vl_max = VSETVL_MAX_M1, vl; + v_z0 = VFMVVF_FLOAT_M1(0, vl_max); + vl_max = VSETVL_MAX; + + FLOAT_V_T va, vx, vy, vr; + BLASLONG stride_x, stride_y, inc_xv, inc_yv; + + BLASLONG m1 = m - offset; + if(inc_x == 1 && inc_y == 1) + { + a_ptr += m1 * lda; + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + vy = VLEV_FLOAT(&y[i], vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + } + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); + + y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + } + } + else if(inc_x == 1) + { + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_yv = inc_y * vl; + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); + + y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + jy += inc_y; + } + } + else if(inc_y == 1) + { + jx = m1 * inc_x; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_xv = inc_x * vl; + + vy = VLEV_FLOAT(&y[i], vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + + ix += inc_xv; + } + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); + + y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + jx += inc_x; + } + } + else + { + jx = m1 * inc_x; + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_xv = inc_x * vl; + inc_yv = inc_y * vl; + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + ix += inc_xv; + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); + + y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + jx += inc_x; + jy += inc_y; + } + } + return(0); +} diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c index 31104eae6..894c6a643 100644 --- a/kernel/riscv64/symv_U_vector.c +++ b/kernel/riscv64/symv_U_vector.c @@ -27,39 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFDOTVV_FLOAT vfdot_vv_f32m4 -#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFDOTVV_FLOAT vfdot_vv_f64m4 -#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -101,8 +107,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < j){ gvl = VSETVL(j-i); vy = VLEV_FLOAT(&y[i], gvl); @@ -112,8 +118,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[j] += temp1 * a_ptr[j] + alpha * temp2; @@ -145,8 +151,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < j){ gvl = VSETVL(j-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -156,8 +162,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[jy] += temp1 * a_ptr[j] + alpha * temp2; @@ -190,8 +196,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; ix += inc_xv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < j){ gvl = VSETVL(j-i); vy = VLEV_FLOAT(&y[i], gvl); @@ -201,8 +207,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[j] += temp1 * a_ptr[j] + alpha * temp2; @@ -240,8 +246,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < j){ gvl = VSETVL(j-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -251,8 +257,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[jy] += temp1 * a_ptr[j] + alpha * temp2; diff --git a/kernel/riscv64/trmm_lncopy_rvv_v1.c b/kernel/riscv64/trmm_lncopy_rvv_v1.c new file mode 100644 index 000000000..4135a9b62 --- /dev/null +++ b/kernel/riscv64/trmm_lncopy_rvv_v1.c @@ -0,0 +1,138 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_lncopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T vb, va1; + + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY + posX * lda; + } + else + { + ao = a + posX + posY * lda; + } + + i = 0; + do + { + if (X > posY) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + + ao ++; + b += vl; + X ++; + i ++; + } + else if (X < posY) + { + ao += lda; + b += vl; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao++; + b += vl; + } + + X += vl; + i += vl; + } + } while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/trmm_ltcopy_rvv_v1.c b/kernel/riscv64/trmm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..580714fde --- /dev/null +++ b/kernel/riscv64/trmm_ltcopy_rvv_v1.c @@ -0,0 +1,134 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_ltcopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + FLOAT_V_T vb, va1; + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY + posX * lda; + } + else + { + ao = a + posX + posY * lda; + } + + i = 0; + do + { + if (X > posY) + { + ao ++; + b += vl; + X ++; + i ++; + } + else if (X < posY) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + + ao += lda; + b += vl; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao += lda; + b += vl; + } + X += vl; + i += vl; + + } + } while (i < m); + + posY += vl; + } + + return 0; +} + diff --git a/kernel/riscv64/trmm_uncopy_rvv_v1.c b/kernel/riscv64/trmm_uncopy_rvv_v1.c new file mode 100644 index 000000000..852ab7f11 --- /dev/null +++ b/kernel/riscv64/trmm_uncopy_rvv_v1.c @@ -0,0 +1,136 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_uncopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + BLASLONG stride_lda = sizeof(FLOAT) * lda; + FLOAT *ao; + + FLOAT_V_T vb, va1; + size_t vl; + +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posX + posY * lda; + } + else + { + ao = a + posY + posX * lda; + } + + i = 0; + do + { + if (X < posY) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + + ao ++; + b += vl; + X ++; + i ++; + } + else if (X > posY) + { + ao += lda; + b += vl; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao++; + b += vl; + } + + X += vl; + i += vl; + } + }while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/trmm_utcopy_rvv_v1.c b/kernel/riscv64/trmm_utcopy_rvv_v1.c new file mode 100644 index 000000000..e0b6d362d --- /dev/null +++ b/kernel/riscv64/trmm_utcopy_rvv_v1.c @@ -0,0 +1,133 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_utcopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, j, js, X; + + FLOAT *ao; + FLOAT_V_T vb, va1; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + + X = posX; + + if (posX <= posY) + { + ao = a + posX + posY * lda; + } + else + { + ao = a + posY + posX * lda; + } + + i = 0; + do + { + if (X < posY) + { + ao ++; + b += vl; + X ++; + i++; + } + else if (X > posY) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + ao += lda; + b += vl; + X++; + i++; + } + else + { + vindex = VID_V_UINT(vl); + for (j = 0; j < vl; j++) + { + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao += lda; + b += vl; + } + X += vl; + i += vl; + } + }while (i < m); + posY += vl; + } + return 0; +} + diff --git a/kernel/riscv64/trmmkernel_rvv_v1x8.c b/kernel/riscv64/trmmkernel_rvv_v1x8.c new file mode 100644 index 000000000..393b24bce --- /dev/null +++ b/kernel/riscv64/trmmkernel_rvv_v1x8.c @@ -0,0 +1,685 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 +#endif + + +// Optimizes the implementation in ../generic/trmmkernel_8x8.c + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + //fprintf(stderr, "%s, %s, bm=%4ld bn=%4ld bk=%4ld alpha=%f ldc=%ld\n", __FILE__, __FUNCTION__, bm, bn, bk, alpha, ldc); + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb; + + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + size_t vl; + + BLASLONG off, temp; + +#if !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + for (j = bn/8; j > 0; j--) + { + C0 = C; + C1 = C0+ldc; + C2 = C1+ldc; + C3 = C2+ldc; + C4 = C3+ldc; + C5 = C4+ldc; + C6 = C5+ldc; + C7 = C6+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*8; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+8; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + ptrbb += 8; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl); + ptrbb += 8; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl); + ptrbb += 8; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl); + ptrbb += 8; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl); + ptrbb += 8; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl); + ptrbb += 8; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl); + ptrbb += 8; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl); + ptrbb += 8; + } + + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); // M:8 (should be vlen); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + ptrba += vl; + } + + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VFMULVF_FLOAT(vres1, alpha, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VFMULVF_FLOAT(vres2, alpha, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VFMULVF_FLOAT(vres3, alpha, vl); + VSEV_FLOAT(C3, va3, vl); + + va4 = VFMULVF_FLOAT(vres4, alpha, vl); + VSEV_FLOAT(C4, va4, vl); + + va5 = VFMULVF_FLOAT(vres5, alpha, vl); + VSEV_FLOAT(C5, va5, vl); + + va6 = VFMULVF_FLOAT(vres6, alpha, vl); + VSEV_FLOAT(C6, va6, vl); + + va7 = VFMULVF_FLOAT(vres7, alpha, vl); + VSEV_FLOAT(C7, va7, vl); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 8; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*8; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + C4 += vl; + C5 += vl; + C6 += vl; + C7 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 8; +#endif + + bb += (bk<<3); + C += (ldc<<3); + } + + if (bn & 4) + { + C0 = C; + C1 = C0+ldc; + C2 = C1+ldc; + C3 = C2+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*4; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+4; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + ptrbb += 4; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + ptrbb += 4; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + ptrbb += 4; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + ptrbb += 4; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + ptrbb += 4; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + ptrbb += 4; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + ptrbb += 4; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + ptrbb += 4; + } + + // K remainder + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + ptrba += vl; + } + + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VFMULVF_FLOAT(vres1, alpha, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VFMULVF_FLOAT(vres2, alpha, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VFMULVF_FLOAT(vres3, alpha, vl); + VSEV_FLOAT(C3, va3, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 4; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*4; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; +#endif + + bb += (bk<<2); + C += (ldc<<2); + } + + if (bn & 2) + { + C0 = C; + C1 = C0+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+2; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + ptrbb += 2; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + ptrbb += 2; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + ptrbb += 2; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + ptrbb += 2; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + ptrbb += 2; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + ptrbb += 2; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + ptrbb += 2; + } + + // K remainder + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + ptrba += vl; + } + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VFMULVF_FLOAT(vres1, alpha, vl); + VSEV_FLOAT(C1, va1, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + C1 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + + bb += (bk<<1); + C += (ldc<<1); + } + + if (bn & 1) + { + C0 = C; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*1; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+1; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + ptrbb += 1; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + ptrbb += 1; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + ptrbb += 1; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + ptrbb += 1; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + ptrbb += 1; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + ptrbb += 1; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + ptrbb += 1; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + ptrbb += 1; + } + + // K remainder + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + + ptrbb += 1; + ptrba += vl; + } + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*1; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; +#endif + + bb += (bk); + C += (ldc); + } + return 0; +} + diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c new file mode 100644 index 000000000..869561fb3 --- /dev/null +++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c @@ -0,0 +1,364 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_LN_sve.c + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT aa; + FLOAT* pc; + + int i, j, k; + + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + + FLOAT_V_T vb, vc; + + size_t vl; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + pc = c; + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + vb = VLSEV_FLOAT(pc + i, stride_ldc, vl); + vb = VFMULVF_FLOAT(vb, aa, vl); + VSEV_FLOAT(b, vb, vl); + VSSEV_FLOAT(pc + i, stride_ldc, vb, vl); + b += vl; + + for (k = 0; k < i; k ++) { + vc = VLSEV_FLOAT(pc + k, stride_ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(a + k), vb, vl); + VSSEV_FLOAT(pc + k, stride_ldc, vc, vl); + } + pc += vl * ldc; + } + a -= m; + b -= 2 * n; + } + +} +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT *pc; + int i, j, k; + + BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2; + + FLOAT_VX2_T vbx2, vsx2, vcx2; + FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2; + size_t vl; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + pc = c; + + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + vbx2 = VLSSEG2_FLOAT(pc + i * 2, stride_ldc, vl); + vb1 = VGET_VX2(vbx2, 0); + vb2 = VGET_VX2(vbx2, 1); +#ifndef CONJ + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFMACCVF_FLOAT(vs2, aa2, vb1, vl); +#else + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFMACCVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl); +#endif + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(b, vsx2, vl); + VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vsx2, vl); + b += vl * 2; + + for (k = 0; k < i; k ++) { + vcx2 = VLSSEG2_FLOAT(pc + k * 2, stride_ldc, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); +#ifndef CONJ + vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); +#else + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); +#endif + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vcx2, vl); + } + pc += vl * ldc * 2; + } + a -= m * 2; + b -= 4 * n; + } +} + + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + i = m % vl; + if (i) { + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + int mod = i; + i = vl; + if (i <= m) { + aa = a + (m - mod - vl) * k * COMPSIZE; + cc = c + (m - mod - vl) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + (kk - vl) * vl * COMPSIZE, + b + (kk - vl) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa -= vl * k * COMPSIZE; + cc -= vl * COMPSIZE; + kk -= vl; + + i += vl; + } while (i <= m); + } + + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + i = m % vl; + if (i) { + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + int mod = i; + i = vl; + if (i <= m) { + aa = a + (m - mod - vl) * k * COMPSIZE; + cc = c + (m - mod - vl) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, j, + aa + (kk - vl) * vl * COMPSIZE, + b + (kk - vl) * j * COMPSIZE, + cc, ldc); + + aa -= vl * k * COMPSIZE; + cc -= vl * COMPSIZE; + kk -= vl; + + i += vl; + } while (i <= m); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c new file mode 100644 index 000000000..da443cfba --- /dev/null +++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c @@ -0,0 +1,341 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_LT_sve.c + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa; + FLOAT* pc; + + int i, j, k; + + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + + FLOAT_V_T vb, vc; + + size_t vl; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + pc = c; + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + vb = VLSEV_FLOAT(pc + i, stride_ldc, vl); + vb = VFMULVF_FLOAT(vb, aa, vl); + VSEV_FLOAT(b, vb, vl); + VSSEV_FLOAT(pc + i, stride_ldc, vb, vl); + b += vl; + + for (k = i + 1; k < m; k++) { + vc = VLSEV_FLOAT(pc + k, stride_ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(a + k), vb, vl); + VSSEV_FLOAT(pc + k, stride_ldc, vc, vl); + } + pc += vl * ldc; + } + a += m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT *pc; + int i, j, k; + + BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2; + + FLOAT_VX2_T vbx2, vsx2, vcx2; + FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2; + size_t vl; + + ldc *= 2; + + for (i = 0; i < m; i++) { + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + pc = c; + + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + vbx2 = VLSSEG2_FLOAT(pc + i * 2, stride_ldc, vl); + vb1 = VGET_VX2(vbx2, 0); + vb2 = VGET_VX2(vbx2, 1); +#ifndef CONJ + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFMACCVF_FLOAT(vs2, aa2, vb1, vl); +#else + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFMACCVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl); +#endif + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(b, vsx2, vl); + VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vsx2, vl); + b += vl * 2; + + for (k = i + 1; k < m; k++) { + vcx2 = VLSSEG2_FLOAT(pc + k * 2, stride_ldc, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); +#ifndef CONJ + vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); +#else + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); +#endif + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vcx2, vl); + } + pc += vl * ldc * 2; + } + + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = vl; + + while (i <= m) { + + if (kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + kk * vl * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + kk += vl; + i += vl; + } + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = vl; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(vl, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(vl, j, + aa + kk * vl * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + kk += vl; + i += vl; + } + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c new file mode 100644 index 000000000..32e481036 --- /dev/null +++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c @@ -0,0 +1,337 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 +#endif + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_RN_sve.c + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT bb; + FLOAT *pci, *pcj; + + int i, j, k; + FLOAT_V_T va, vc; + + size_t vl; + for (i = 0; i < n; i++) { + + bb = *(b + i); + pci = c + i * ldc; + pcj = c; + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + va = VLEV_FLOAT(pci, vl); + va = VFMULVF_FLOAT(va, bb, vl); + VSEV_FLOAT(a, va, vl); + VSEV_FLOAT(pci, va, vl); + a += vl; + pci += vl; + for (k = i + 1; k < n; k ++){ + vc = VLEV_FLOAT(pcj + k * ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(b + k), va, vl); + VSEV_FLOAT(pcj + k * ldc, vc, vl); + } + pcj += vl; + } + b += n; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT bb1, bb2; + + FLOAT *pci, *pcj; + + int i, j, k; + + FLOAT_VX2_T vax2, vsx2, vcx2; + FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2; + + size_t vl; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + pci = c + i * ldc * 2; + pcj = c; + + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + vax2 = VLSEG2_FLOAT(pci, vl); + va1 = VGET_VX2(vax2, 0); + va2 = VGET_VX2(vax2, 1); +#ifndef CONJ + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va1, bb2, vl); + vs2 = VFMACCVF_FLOAT(vs2, bb1, va2, vl); +#else + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFMACCVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va2, bb1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl); +#endif + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(a, vsx2, vl); + VSSEG2_FLOAT(pci, vsx2, vl); + a += vl * 2; + pci += vl * 2; + + for (k = i + 1; k < n; k ++){ + vcx2 = VLSEG2_FLOAT(pcj + k * ldc * 2, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); +#ifndef CONJ + vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); +#else + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); +#endif + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSEG2_FLOAT(pcj + k * ldc * 2, vcx2, vl); + } + pcj += vl * 2; + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + + while (j > 0) { + + aa = a; + cc = c; + + i = vl; + + if (i <= m) { + do { + if (kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + kk * vl * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } while (i <= m); + } + + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = vl; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(vl, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(vl, j, + aa + kk * vl * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c new file mode 100644 index 000000000..81cc41818 --- /dev/null +++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c @@ -0,0 +1,356 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_RT_sve.c + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT bb; + FLOAT *pci, *pcj; + + int i, j, k; + FLOAT_V_T va, vc; + + size_t vl; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + pci = c + i * ldc; + pcj = c; + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + va = VLEV_FLOAT(pci, vl); + va = VFMULVF_FLOAT(va, bb, vl); + VSEV_FLOAT(a, va, vl); + VSEV_FLOAT(pci, va, vl); + a += vl; + pci += vl; + for (k = 0; k < i; k ++){ + vc = VLEV_FLOAT(pcj + k * ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(b + k), va, vl); + VSEV_FLOAT(pcj + k * ldc, vc, vl); + } + pcj += vl; + } + b -= n; + a -= 2 * m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT bb1, bb2; + + FLOAT *pci, *pcj; + + int i, j, k; + + FLOAT_VX2_T vax2, vsx2, vcx2; + FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2; + + size_t vl; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + pci = c + i * ldc * 2; + pcj = c; + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + vax2 = VLSEG2_FLOAT(pci, vl); + va1 = VGET_VX2(vax2, 0); + va2 = VGET_VX2(vax2, 1); +#ifndef CONJ + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va1, bb2, vl); + vs2 = VFMACCVF_FLOAT(vs2, bb1, va2, vl); +#else + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFMACCVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va2, bb1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl); +#endif + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(a, vsx2, vl); + VSSEG2_FLOAT(pci, vsx2, vl); + a += vl * 2; + pci += vl * 2; + + for (k = 0; k < i; k ++){ + vcx2 = VLSEG2_FLOAT(pcj + k * ldc * 2, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); +#ifndef CONJ + vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); +#else + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); +#endif + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSEG2_FLOAT(pcj + k * ldc * 2, vcx2, vl); + } + pcj += vl * 2; + } + b -= n * 2; + a -= 4 * m; + } +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = vl; + if (i <= m) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, j, + aa + (kk - j) * vl * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } while (i <= m); + } + + i = m % vl; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = vl; + if (i <= m) { + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * vl * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } while (i <= m); + } + + i = m % vl; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/riscv64/trsm_lncopy_rvv_v1.c b/kernel/riscv64/trsm_lncopy_rvv_v1.c new file mode 100644 index 000000000..41c84be25 --- /dev/null +++ b/kernel/riscv64/trsm_lncopy_rvv_v1.c @@ -0,0 +1,122 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 + +#endif + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_lncopy_sve.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + + *(b + j) = INV(*(ao + j * lda)); + ao++; + b += vl; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + } + ao++; + b += vl; + i++; + ii++; + } + } + + a += vl * lda; + jj += vl; + } + + return 0; +} diff --git a/kernel/riscv64/trsm_ltcopy_rvv_v1.c b/kernel/riscv64/trsm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..003bd3465 --- /dev/null +++ b/kernel/riscv64/trsm_ltcopy_rvv_v1.c @@ -0,0 +1,122 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#endif + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_ltcopy_sve.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + FLOAT_V_T va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + *(b + j) = INV(*(ao + j)); + + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + + b += vl; + ao += lda; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + } + ao += lda; + b += vl; + i ++; + ii ++; + } + } + + a += vl; + jj += vl; + } + return 0; +} + diff --git a/kernel/riscv64/trsm_uncopy_rvv_v1.c b/kernel/riscv64/trsm_uncopy_rvv_v1.c new file mode 100644 index 000000000..6cca5d49c --- /dev/null +++ b/kernel/riscv64/trsm_uncopy_rvv_v1.c @@ -0,0 +1,121 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#endif + + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_uncopy_sve.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT *ao; + jj = offset; + + FLOAT_V_T va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + i = 0; + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + *(b + j) = INV(*(ao + j * lda)); + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + ao++; + b += vl; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + } + ao++; + b += vl; + i++; + ii++; + } + } + + a += vl * lda; + jj += vl; + } + return 0; +} diff --git a/kernel/riscv64/trsm_utcopy_rvv_v1.c b/kernel/riscv64/trsm_utcopy_rvv_v1.c new file mode 100644 index 000000000..bc058525f --- /dev/null +++ b/kernel/riscv64/trsm_utcopy_rvv_v1.c @@ -0,0 +1,123 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#endif + + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_utcopy_sve.c + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + FLOAT_V_T va1; + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + *(b + j) = INV(*(ao + j)); + + ao += lda; + b += vl; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + } + ao += lda; + b += vl; + i ++; + ii ++; + } + } + + a += vl; + jj += vl; + } + + return 0; +} diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c new file mode 100644 index 000000000..180cf059a --- /dev/null +++ b/kernel/riscv64/zamax_rvv.c @@ -0,0 +1,124 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + FLOAT_V_T v0, v1, vmax; + FLOAT_V_T_M1 v_res; + FLOAT_VX2_T vx2; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmax = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + vx2 = VLSEG_FLOAT(x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, v0, vl); + + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, v0, vl); + } + + } + + v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + + return(maxf); +} diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c index 9dbeba90f..ec4a5a1e9 100644 --- a/kernel/riscv64/zamax_vector.c +++ b/kernel/riscv64/zamax_vector.c @@ -28,39 +28,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 - +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif +#endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) (v_res, va, vb, gvl) +#else +#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) #endif +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFMAXVV_FLOAT JOIN(RISCV_RVV(vfmax), _vv_f, ELEN, LMUL, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) +#define VFABSV_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -70,10 +78,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; @@ -83,10 +89,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for(; i maxf) - maxf = VFMVFS_FLOAT(v_res); + v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl); } + maxf = EXTRACT_FLOAT(v_res); return(maxf); } diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c new file mode 100644 index 000000000..56a467502 --- /dev/null +++ b/kernel/riscv64/zamin_rvv.c @@ -0,0 +1,123 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + FLOAT_V_T v0, v1, vmin; + FLOAT_V_T_M1 v_res; + FLOAT_VX2_T vx2; + + v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + vx2 = VLSEG_FLOAT(x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, v0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, v0, vl); + } + + } + + v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + + return(minf); +} diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c index dc58075ac..45b3e0b9d 100644 --- a/kernel/riscv64/zamin_vector.c +++ b/kernel/riscv64/zamin_vector.c @@ -29,38 +29,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) (v_res, va, vb, gvl) +#else +#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFMINVV_FLOAT JOIN(RISCV_RVV(vfmin), _vv_f, ELEN, LMUL, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) +#define VFABSV_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -69,10 +80,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; @@ -82,10 +91,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for(; i 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + v0 = VLEV_FLOAT(x, vl); + v1 = VLEV_FLOAT(x+vl, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); + } + + } + else { + + int stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + v0 = VLSEV_FLOAT(x, stride_x, vl); + v1 = VLSEV_FLOAT(x+1, stride_x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); + } + + } + + FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_res = VFREDSUMVS_FLOAT(v_sum, v_res, vlmax); + asumf += VFMVFS_FLOAT_M1(v_res); + + return(asumf); +} diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c index 8386ab62e..fca904d6a 100644 --- a/kernel/riscv64/zasum_vector.c +++ b/kernel/riscv64/zasum_vector.c @@ -28,37 +28,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFADDVV_FLOAT vfadd_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b32 +# else +# define ELEN 32 +# define MLEN _b16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VFFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFADDVV_FLOAT vfadd_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b8 +# else +# define ELEN 32 +# define MLEN _b4 +# endif #endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt), _vf_f, ELEN, LMUL, MLEN) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -67,12 +77,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_zero,v_sum; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); - MASK_T mask0, mask1; if(inc_x == 1){ BLASLONG n2 = n * 2; gvl = VSETVL(n2); @@ -81,26 +88,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i 0; n -= vl, y += vl*inc_y2) + { + vl = VSETVL(n); + VSSSEG_FLOAT(y, stride_y, vxx2, vl); + } + } + else + { + for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) + { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + + vy0 = VFMULVF_FLOAT(vx1, alpha_i, vl); + vy0 = VFMSACVF_FLOAT(vy0, alpha_r, vx0, vl); + + vy1 = VFMULVF_FLOAT(vx1, alpha_r, vl); + vy1 = VFMACCVF_FLOAT(vy1, alpha_i, vx0, vl); + + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); + } + } + } + else + { + FLOAT_V_T v0, v1; + FLOAT_VX2_T v_x2; + + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + for (size_t vl; n > 0; n -= vl, y += vl*inc_y2) + { + vl = VSETVL(n); + + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + v0 = VFMULVF_FLOAT(vy1, beta_i, vl); + v0 = VFMSACVF_FLOAT(v0, beta_r, vy0, vl); + + v1 = VFMULVF_FLOAT(vy1, beta_r, vl); + v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); + + v_x2 = VSET_VX2(v_x2, 0, v0); + v_x2 = VSET_VX2(v_x2, 1, v1); + VSSSEG_FLOAT(y, stride_y, v_x2, vl); + } + } + else + { + for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) + { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + v0 = VFMULVF_FLOAT(vx0, alpha_r, vl); + v0 = VFNMSACVF_FLOAT(v0, alpha_i, vx1, vl); + v0 = VFMACCVF_FLOAT(v0, beta_r, vy0, vl); + v0 = VFNMSACVF_FLOAT(v0, beta_i, vy1, vl); + + v1 = VFMULVF_FLOAT(vx1, alpha_r, vl); + v1 = VFMACCVF_FLOAT(v1, alpha_i, vx0, vl); + v1 = VFMACCVF_FLOAT(v1, beta_r, vy1, vl); + v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); + + v_x2 = VSET_VX2(v_x2, 0, v0); + v_x2 = VSET_VX2(v_x2, 1, v1); + + VSSSEG_FLOAT(y, stride_y, v_x2, vl); + } + } + } + return(0); + +} diff --git a/kernel/riscv64/zaxpby_vector.c b/kernel/riscv64/zaxpby_vector.c index 3eca20415..bbf2bbe7d 100644 --- a/kernel/riscv64/zaxpby_vector.c +++ b/kernel/riscv64/zaxpby_vector.c @@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFMSACVF_FLOAT vfmsac_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFMSACVF_FLOAT vfmsac_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i, FLOAT *y, BLASLONG inc_y) @@ -62,6 +62,82 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL stride_x = inc_x * 2 * sizeof(FLOAT); stride_y = inc_y * 2 * sizeof(FLOAT); + if (inc_x == 0 || inc_y == 0) { + + FLOAT temp; + BLASLONG inc_x2, inc_y2; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + if ( beta_r == 0.0 && beta_i == 0.0) + { + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + + while(i < n) + { + y[iy] = 0.0 ; + y[iy+1] = 0.0 ; + iy += inc_y2 ; + i++ ; + } + + } + else + { + + while(i < n) + { + y[iy] = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) ; + y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + + + } + + } + else + { + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + + while(i < n) + { + temp = ( beta_r * y[iy] - beta_i * y[iy+1] ) ; + y[iy+1] = ( beta_r * y[iy+1] + beta_i * y[iy] ) ; + y[iy] = temp; + iy += inc_y2 ; + i++ ; + } + + } + else + { + + while(i < n) + { + temp = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) + ( beta_r * y[iy] - beta_i * y[iy+1] ) ; + y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) + ( beta_r * y[iy+1] + beta_i * y[iy] ) ; + y[iy] = temp; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + + + } + + + + } + return(0); + + } else { + if(beta_r == 0.0 && beta_i == 0.0){ if(alpha_r == 0.0 && alpha_i == 0.0){ if(inc_y == 1){ @@ -191,5 +267,6 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL } } return(0); + } } diff --git a/kernel/riscv64/zaxpy.c b/kernel/riscv64/zaxpy.c index 1dcaeac27..18b6315cb 100644 --- a/kernel/riscv64/zaxpy.c +++ b/kernel/riscv64/zaxpy.c @@ -44,7 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG inc_x2; BLASLONG inc_y2; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); if ( da_r == 0.0 && da_i == 0.0 ) return(0); ix = 0; diff --git a/kernel/riscv64/zaxpy_rvv.c b/kernel/riscv64/zaxpy_rvv.c new file mode 100644 index 000000000..0db32df10 --- /dev/null +++ b/kernel/riscv64/zaxpy_rvv.c @@ -0,0 +1,190 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if(n < 0) return(0); + if(da_r == 0.0 && da_i == 0.0) return(0); + + FLOAT_V_T vx0, vx1, vy0, vy1; + FLOAT_VX2_T vxx2, vyx2; + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSEG_FLOAT(y, vyx2, vl); + } + + } else if (inc_x == 1) { + + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); + } + + } else if (inc_y == 1) { + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSEG_FLOAT(y, vyx2, vl); + } + + } else { + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c index 303d3541e..1e766c5f4 100644 --- a/kernel/riscv64/zaxpy_vector.c +++ b/kernel/riscv64/zaxpy_vector.c @@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0, j = 0; - BLASLONG ix = 0,iy = 0; - if(n < 0) return(0); - if(da_r == 0.0 && da_i == 0.0) return(0); + BLASLONG i = 0, j = 0; + BLASLONG ix = 0,iy = 0; + if(n <= 0) return(0); + if(da_r == 0.0 && da_i == 0.0) return(0); unsigned int gvl = 0; BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); diff --git a/kernel/riscv64/zcopy.c b/kernel/riscv64/zcopy.c index 07fe584c5..b0f19efd5 100644 --- a/kernel/riscv64/zcopy.c +++ b/kernel/riscv64/zcopy.c @@ -43,7 +43,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG inc_x2; BLASLONG inc_y2; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; diff --git a/kernel/riscv64/zcopy_rvv.c b/kernel/riscv64/zcopy_rvv.c new file mode 100644 index 000000000..13879f03b --- /dev/null +++ b/kernel/riscv64/zcopy_rvv.c @@ -0,0 +1,105 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL_M8(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T_M8 vfloat32m8_t +#define VLEV_FLOAT_M8 __riscv_vle32_v_f32m8 +#define VSEV_FLOAT_M8 __riscv_vse32_v_f32m8 + +#define VSETVL_M4(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_VX2_T_M4 vfloat32m4x2_t +#define VLSEG_FLOAT_M4 __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT_M4 __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT_M4 __riscv_vssseg2e32_v_f32m4x2 +#else +#define VSETVL_M8(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T_M8 vfloat64m8_t +#define VLEV_FLOAT_M8 __riscv_vle64_v_f64m8 +#define VSEV_FLOAT_M8 __riscv_vse64_v_f64m8 + +#define VSETVL_M4(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_VX2_T_M4 vfloat64m4x2_t +#define VLSEG_FLOAT_M4 __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT_M4 __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT_M4 __riscv_vssseg2e64_v_f64m4x2 +#endif + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + if(n <= 0) return(0); + + if(inc_x == 1 && inc_y == 1) { + + FLOAT_V_T_M8 vx; + n *= 2; // convert to words + + for(size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL_M8(n); + vx = VLEV_FLOAT_M8(x, vl); + VSEV_FLOAT_M8(y, vx, vl); + } + + }else if (1 == inc_x) { + + FLOAT_VX2_T_M4 vx2; + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL_M4(n); + vx2 = VLSEG_FLOAT_M4(x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vx2, vl); + } + } else if (1 == inc_y) { + + FLOAT_VX2_T_M4 vx2; + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL_M4(n); + vx2 = VLSSEG_FLOAT_M4(x, stride_x, vl); + VSSEG_FLOAT_M4(y, vx2, vl); + } + } else { + + FLOAT_VX2_T_M4 vx2; + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL_M4(n); + vx2 = VLSSEG_FLOAT_M4(x, stride_x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vx2, vl); + } + } + + return(0); +} diff --git a/kernel/riscv64/zcopy_vector.c b/kernel/riscv64/zcopy_vector.c index 600f02bba..9e4a67b71 100644 --- a/kernel/riscv64/zcopy_vector.c +++ b/kernel/riscv64/zcopy_vector.c @@ -27,15 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) #endif diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c new file mode 100644 index 000000000..13bc2ee39 --- /dev/null +++ b/kernel/riscv64/zdot_rvv.c @@ -0,0 +1,194 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f32m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f64m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + OPENBLAS_COMPLEX_FLOAT result; + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + + if ( n <= 0 ) return(result); + + FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1; + FLOAT_V_T_M1 v_res, v_z0; + FLOAT_VX2_T vxx2, vyx2; + size_t vlmax_m1 = VSETVL_MAX_M1; + v_z0 = VFMVVF_FLOAT_M1(0, vlmax_m1); + + size_t vlmax = VSETVL_MAX; + vr0 = VFMVVF_FLOAT(0, vlmax); + vr1 = VFMVVF_FLOAT(0, vlmax); + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); + #endif + } + + } else if (inc_x == 1){ + + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); + #endif + } + } else if (inc_y == 1){ + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); + #endif + } + }else { + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); + #endif + } + } + + v_res = VFREDSUM_FLOAT(vr0, v_z0, vlmax); + CREAL(result) = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, vlmax); + CIMAG(result) = VFMVFS_FLOAT_M1(v_res); + + return(result); +} diff --git a/kernel/riscv64/zdot_vector.c b/kernel/riscv64/zdot_vector.c index ec38ed9d2..13b8fe378 100644 --- a/kernel/riscv64/zdot_vector.c +++ b/kernel/riscv64/zdot_vector.c @@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFDOTVV_FLOAT vfdot_vv_f32m4 -#define VFMULVV_FLOAT vfmul_vv_f32m4 -#define VFMSACVV_FLOAT vfmsac_vv_f32m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#define VFMSACVV_FLOAT RISCV_RVV(vfmsac_vv_f32m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFDOTVV_FLOAT vfdot_vv_f64m4 -#define VFMULVV_FLOAT vfmul_vv_f64m4 -#define VFMSACVV_FLOAT vfmsac_vv_f64m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) +#define VFMSACVV_FLOAT RISCV_RVV(vfmsac_vv_f64m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4) #endif OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) @@ -109,9 +117,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA ix += inc_xv; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); dot[0] += VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); dot[1] += VFMVFS_FLOAT(v_res); //tail if(j < n){ @@ -132,9 +140,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA vr1 = VFMULVV_FLOAT(vx1, vy0, gvl); vr1 = VFMSACVV_FLOAT(vr1, vx0, vy1, gvl); #endif - v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); dot[0] += VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); dot[1] += VFMVFS_FLOAT(v_res); } CREAL(result) = dot[0]; diff --git a/kernel/riscv64/zgemm_beta_rvv.c b/kernel/riscv64/zgemm_beta_rvv.c new file mode 100644 index 000000000..ee334801b --- /dev/null +++ b/kernel/riscv64/zgemm_beta_rvv.c @@ -0,0 +1,130 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFSUBVV_FLOAT __riscv_vfsub_vv_f32m4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFSUBVV_FLOAT __riscv_vfsub_vv_f64m4 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, + FLOAT beta_r, FLOAT beta_i, + FLOAT *dummy2, BLASLONG dummy3, + FLOAT *dummy4, BLASLONG dummy5, + FLOAT *c, BLASLONG ldc) +{ + BLASLONG chunk; + FLOAT *c_offset; + size_t vl; + FLOAT_V_T vr, vi, v1, v2, v3, v4; + FLOAT_VX2_T vx2; + + ldc *= 2; + c_offset = c; + + if (beta_r == 0.0 && beta_i == 0.0) { + + vl = VSETVL(m); + vr = VFMVVF_FLOAT(0.0, vl); + vi = VFMVVF_FLOAT(0.0, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { + vl = VSETVL(chunk); + + VSSEG_FLOAT(c_offset, vx2, vl); + } + } + + } else { + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { + vl = VSETVL(chunk); + + vx2 = VLSEG_FLOAT(c_offset, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); + + v1 = VFMULVF_FLOAT(vr, beta_r, vl); + v2 = VFMULVF_FLOAT(vi, beta_i, vl); + + v3 = VFMULVF_FLOAT(vi, beta_r, vl); + v4 = VFMULVF_FLOAT(vr, beta_i, vl); + + vr = VFSUBVV_FLOAT(v1, v2, vl); + vi = VFADDVV_FLOAT(v3, v4, vl); + + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); + VSSEG_FLOAT(c_offset, vx2, vl); + } + } + + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c b/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c new file mode 100644 index 000000000..0776f03fd --- /dev/null +++ b/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c @@ -0,0 +1,720 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=4 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f64m2' + VFMUL='__riscv_vfmul_vf_f64m2' + VLEV='__riscv_vle64_v_f64m2' + VLSEV='__riscv_vlse64_v_f64m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m2' + VSETVL='__riscv_vsetvl_e64m2' + VSEV='__riscv_vse64_v_f64m2' + VSSEV='__riscv_vsse64_v_f64m2' + acc_vector_t='vfloat64m2_t' + output='zgemm_kernel_4x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m2(4); + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + double B2r = B[bi + 2 * 2 + 0]; + double B2i = B[bi + 2 * 2 + 1]; + double B3r = B[bi + 3 * 2 + 0]; + double B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat64m2_t ACC2r = tmp0r; + vfloat64m2_t ACC2i = tmp0i; + vfloat64m2_t ACC3r = tmp1r; + vfloat64m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C1r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C1i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C2r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C2i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C3r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C3i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); + C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); + C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); + C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + // -- tails for main pass + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + double result8 = 0; + double result9 = 0; + double result10 = 0; + double result11 = 0; + double result12 = 0; + double result13 = 0; + double result14 = 0; + double result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result8 * alphar; + Ci += result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; + Cr += result10 * alphar; + Ci += result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result12 * alphar; + Ci += result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; + Cr += result14 * alphar; + Ci += result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C1r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C1i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_kernel_8x4_zvl256b.c b/kernel/riscv64/zgemm_kernel_8x4_zvl256b.c new file mode 100644 index 000000000..ca33368f0 --- /dev/null +++ b/kernel/riscv64/zgemm_kernel_8x4_zvl256b.c @@ -0,0 +1,1253 @@ +/* + +AUTOGENERATED KERNEL +Settings: + LMUL=1 + M=8 + M_tail_scalar_from=1 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl256b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='double' + reg_width_bits=256 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=1 + VFMACC='__riscv_vfmacc_vf_f64m1' + VFMUL='__riscv_vfmul_vf_f64m1' + VLEV='__riscv_vle64_v_f64m1' + VLSEV='__riscv_vlse64_v_f64m1' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m1' + VSETVL='__riscv_vsetvl_e64m1' + VSEV='__riscv_vse64_v_f64m1' + VSSEV='__riscv_vsse64_v_f64m1' + acc_vector_t='vfloat64m1_t' + output='zgemm_kernel_8x4_zvl256b.c' + param_scalar_t='double' + param_vector_t='vfloat64m1_t' + +*/ + +#include "common.h" + + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define S0 1 + #define S1 -1 + #define S2 1 + #define S3 1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define S0 1 + #define S1 1 + #define S2 1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define S0 1 + #define S1 1 + #define S2 -1 + #define S3 1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define S0 1 + #define S1 -1 + #define S2 -1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfnmacc +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + + // -- MAIN PASS + + for (BLASLONG j=0; j N ) + n_packing >>= 1; + + BLASLONG m_packing = UNROLL_M; + BLASLONG m_top = 0; + while (m_top < M) + { + while( m_top+m_packing > M ) + m_packing >>= 1; + + BLASLONG ai = K*m_top*2; + BLASLONG bi = K*n_top*2; + + BLASLONG pass_K = K; + + + #ifdef TRMMKERNEL + #ifdef LEFT + BLASLONG off = offset + m_top; + #else + BLASLONG off = -offset + n_top; + #endif + #ifdef BACKWARDS + ai += off * m_packing*2; + bi += off * n_packing*2; + pass_K -= off; + #else + #ifdef LEFT + pass_K = off + m_packing; + #else + pass_K = off + n_packing; + #endif + #endif + #endif + + memset( res, 0, UNROLL_M*UNROLL_N*2*sizeof(FLOAT) ); + + for (BLASLONG k=0; k> 2); j > 0; j--) { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + v1x2 = VLSEG2_FLOAT(aoffset1, vl); + v2x2 = VLSEG2_FLOAT(aoffset2, vl); + v3x2 = VLSEG2_FLOAT(aoffset3, vl); + v4x2 = VLSEG2_FLOAT(aoffset4, vl); + + vxx8 = VSET_VX8(vxx8, 0, VGET_VX2(v1x2, 0)); + vxx8 = VSET_VX8(vxx8, 1, VGET_VX2(v1x2, 1)); + vxx8 = VSET_VX8(vxx8, 2, VGET_VX2(v2x2, 0)); + vxx8 = VSET_VX8(vxx8, 3, VGET_VX2(v2x2, 1)); + vxx8 = VSET_VX8(vxx8, 4, VGET_VX2(v3x2, 0)); + vxx8 = VSET_VX8(vxx8, 5, VGET_VX2(v3x2, 1)); + vxx8 = VSET_VX8(vxx8, 6, VGET_VX2(v4x2, 0)); + vxx8 = VSET_VX8(vxx8, 7, VGET_VX2(v4x2, 1)); + + VSSEG8_FLOAT(boffset, vxx8, vl); + + aoffset1 += vl * 2; + aoffset2 += vl * 2; + aoffset3 += vl * 2; + aoffset4 += vl * 2; + boffset += vl * 8; + } + } + + if (n & 2) { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + v1x2 = VLSEG2_FLOAT(aoffset1, vl); + v2x2 = VLSEG2_FLOAT(aoffset2, vl); + + vxx4 = VSET_VX4(vxx4, 0, VGET_VX2(v1x2, 0)); + vxx4 = VSET_VX4(vxx4, 1, VGET_VX2(v1x2, 1)); + vxx4 = VSET_VX4(vxx4, 2, VGET_VX2(v2x2, 0)); + vxx4 = VSET_VX4(vxx4, 3, VGET_VX2(v2x2, 1)); + + VSSEG4_FLOAT(boffset, vxx4, vl); + + aoffset1 += vl * 2; + aoffset2 += vl * 2; + boffset += vl * 4; + } + } + + if (n & 1) { + aoffset1 = aoffset; + aoffset += lda; + + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + v1x2 = VLSEG2_FLOAT(aoffset1, vl); + + VSSEG2_FLOAT(boffset, v1x2, vl); + + aoffset1 += vl * 2; + boffset += vl * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_ncopy_rvv_v1.c b/kernel/riscv64/zgemm_ncopy_rvv_v1.c new file mode 100644 index 000000000..275daa5f2 --- /dev/null +++ b/kernel/riscv64/zgemm_ncopy_rvv_v1.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1; + FLOAT *b_offset; + + FLOAT_VX2_T vx2; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + a_offset = a; + b_offset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + a_offset1 = a_offset; + a_offset += vl * lda * 2; + + for(i = m; i > 0; i--) { + vx2 = VLSSEG2_FLOAT(a_offset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(b_offset, vx2, vl); + + a_offset1 += 2; + b_offset += vl * 2; + } + } + return 0; +} + diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv.c b/kernel/riscv64/zgemm_tcopy_4_rvv.c new file mode 100644 index 000000000..cfafbf0dc --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_4_rvv.c @@ -0,0 +1,191 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 +#else +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m1 +#define VSEV_FLOAT __riscv_vse64_v_f64m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + + IFLOAT *boffset, *boffset1, *boffset2, *boffset3; + + FLOAT_V_T v0; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; + + size_t vl; + + //fprintf(stderr, "%s m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + aoffset = a; + boffset = b; + boffset2 = b + 2 * m * (n & ~3); + boffset3 = b + 2 * m * (n & ~1); + + for(j = (m >> 2); j > 0; j--) { + + aoffset1 = aoffset; + aoffset += 8 * lda; + + boffset1 = boffset; + boffset += 32; + + for(i = (n >> 2); i > 0; i--) { + vl = 4; + + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG8_FLOAT(boffset1, vx8, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 2) { + vl = 4; + + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG4_FLOAT(boffset2, vx4, vl); + + aoffset1 += 4; + boffset2 += 16; + } + + if (n & 1) { + vl = 4; + + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(boffset3, vx2, vl); + + aoffset1 += 2; + boffset3 += 8; + } + } + + if (m & 2) { + aoffset1 = aoffset; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 16; + + for(i = (n >> 2); i > 0; i--) { + vl = 2; + + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG8_FLOAT(boffset1, vx8, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 2) { + vl = 2; + + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG4_FLOAT(boffset2, vx4, vl); + + aoffset1 += 4; + boffset2 += 8; + } + + if (n & 1) { + vl = 2; + + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(boffset3, vx2, vl); + + //aoffset1 += 2; + boffset3 += 4; + } + } + + if (m & 1) { + aoffset1 = aoffset; + boffset1 = boffset; + + for(i = (n >> 2); i > 0; i--) { + vl = 8; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v0, vl); + + aoffset1 += 8; + boffset1 += 8 * m; + } + + if (n & 2) { + vl = 4; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset2, v0, vl); + + aoffset1 += 4; + //boffset2 += 4; + } + + if (n & 1) { + *(boffset3) = *(aoffset1); + *(boffset3 + 1) = *(aoffset1 + 1); + } + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_tcopy_rvv_v1.c b/kernel/riscv64/zgemm_tcopy_rvv_v1.c new file mode 100644 index 000000000..96e986502 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_rvv_v1.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + IFLOAT *boffset; + + FLOAT_VX2_T vx2; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + aoffset = a; + boffset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + aoffset1 = aoffset; + aoffset += vl * 2; + + for(i = m; i > 0; i--) { + vx2 = VLSEG2_FLOAT(aoffset1, vl); + VSSEG2_FLOAT(boffset, vx2, vl); + + aoffset1 += lda * 2; + boffset += vl * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/zgemmkernel_rvv_v1x4.c b/kernel/riscv64/zgemmkernel_rvv_v1x4.c new file mode 100644 index 000000000..77e012ff5 --- /dev/null +++ b/kernel/riscv64/zgemmkernel_rvv_v1x4.c @@ -0,0 +1,553 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#endif + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb; + + FLOAT_VX2_T vax2; + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + + //fprintf(stderr, "%s, bn=%ld bm=%ld bk=%ld alphar=%f alphai=%f ldc=%ld\n", __FUNCTION__, bn, bm, bk, alphar, alphai, ldc); // Debug + + size_t vl; + for (j = bn/4; j > 0; j--) + { + C0 = C; + C1 = C0 + 2 * ldc; + C2 = C1 + 2 * ldc; + C3 = C2 + 2 * ldc; + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); + + for (k = bk/4; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va2, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va3, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va3, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va2, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va2, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va3, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va3, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va2, vl); + + ptrbb += 8; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va4, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va5, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va5, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va4, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va4, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va5, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va5, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va4, vl); + ptrbb += 8; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va6, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va7, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va7, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va6, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va6, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va7, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va7, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va6, vl); + + ptrbb += 8; + } + + for (k = (bk & 3); k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + } + + vax2 = VLSEG2_FLOAT(C0, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vax2 = VLSEG2_FLOAT(C1, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); + + va2 = VFMACCVF_FLOAT(va2, alphar, vres2, vl); + va3 = VFMACCVF_FLOAT(va3, alphar, vres3, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); + + vax2 = VLSEG2_FLOAT(C2, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vax2 = VLSEG2_FLOAT(C3, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + + va0 = VFMACCVF_FLOAT(va0, alphar, vres4, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres5, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres4, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C2, vax2, vl); + + va2 = VFMACCVF_FLOAT(va2, alphar, vres6, vl); + va3 = VFMACCVF_FLOAT(va3, alphar, vres7, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres6, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C3, vax2, vl); + + C0 += vl * 2; + C1 += vl * 2; + C2 += vl * 2; + C3 += vl * 2; + } + + bb += (bk << 3); + C += (ldc << 3); + } + + if (bn & 2) + { + C0 = C; + C1 = C0 + 2 * ldc; + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + + for (k = bk/4; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + ptrbb += 4; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + ptrbb += 4; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + ptrbb += 4; + } + + for (k = (bk & 3); k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + } + + vax2 = VLSEG2_FLOAT(C0, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vax2 = VLSEG2_FLOAT(C1, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); + + va2 = VFMACCVF_FLOAT(va2, alphar, vres2, vl); + va3 = VFMACCVF_FLOAT(va3, alphar, vres3, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); + + C0 += vl * 2; + C1 += vl * 2; + } + + bb += (bk << 2); + C += (ldc << 2); + } + + if (bn & 1) + { + C0 = C; + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + + for (k = bk/4; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + ptrbb += 2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + ptrbb += 2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + ptrbb += 2; + } + + for (k = (bk & 3); k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + } + + vax2 = VLSEG2_FLOAT(C0, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); + C0 += vl * 2; + } + + bb += bk << 1; + C += ldc << 1; + } + return 0; +} + diff --git a/kernel/riscv64/zgemv_n_rvv.c b/kernel/riscv64/zgemv_n_rvv.c new file mode 100644 index 000000000..f14ef5ba8 --- /dev/null +++ b/kernel/riscv64/zgemv_n_rvv.c @@ -0,0 +1,192 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix; + FLOAT *a_ptr; + FLOAT temp_r, temp_i; + FLOAT_V_T va0, va1, vy0, vy1; + FLOAT_VX2_T vax2, vyx2; + + BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2; + + BLASLONG inc_x2 = inc_x * 2; + BLASLONG lda2 = lda * 2; + if (inc_y == 1) + { + for (size_t vl; m > 0; m -= vl, a += vl*2, y += vl*2) { + vl = VSETVL(m); + a_ptr = a; + ix = 0; + vyx2 = VLSEG_FLOAT(y, vl); + + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + for(i = 0; i < n; i++){ +#if !defined(XCONJ) + temp_r = alpha_r * x[ix] - alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] + alpha_i * x[ix]; +#else + temp_r = alpha_r * x[ix] + alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; +#endif + + vax2 = VLSEG_FLOAT(a_ptr, vl); + + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); +#if !defined(CONJ) +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#else +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#endif + a_ptr += lda2; + ix += inc_x2; + } + + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSEG_FLOAT(y, vyx2, vl); + } + + } + else + { + for (size_t vl; m > 0; m -= vl, a += vl*2, y += vl*inc_y*2) { + vl = VSETVL(m); + a_ptr = a; + ix = 0; + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + for(i = 0; i < n; i++){ +#if !defined(XCONJ) + temp_r = alpha_r * x[ix] - alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] + alpha_i * x[ix]; +#else + temp_r = alpha_r * x[ix] + alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; +#endif + + vax2 = VLSEG_FLOAT(a_ptr, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); +#if !defined(CONJ) +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#else +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#endif + a_ptr += lda2; + ix += inc_x2; + } + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); + } + } + return(0); +} diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c index b5ee1f054..104d3865d 100644 --- a/kernel/riscv64/zgemv_n_vector.c +++ b/kernel/riscv64/zgemv_n_vector.c @@ -27,23 +27,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c new file mode 100644 index 000000000..1c89a9f72 --- /dev/null +++ b/kernel/riscv64/zgemv_t_rvv.c @@ -0,0 +1,187 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDSUM_FLOAT_TU __riscv_vfredusum_vs_f32m4_f32m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDSUM_FLOAT_TU __riscv_vfredusum_vs_f64m4_f64m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i = 0, j = 0; + BLASLONG ix = 0, iy = 0; + FLOAT *a_ptr = a; + FLOAT temp_r, temp_i; + + FLOAT_V_T va0, va1, vx0, vx1, vr, vi; + FLOAT_V_T_M1 v_res, v_z0; + FLOAT_VX2_T vxx2, vax2; + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + //BLASLONG stride_a = sizeof(FLOAT) * 2; + BLASLONG inc_y2 = inc_y * 2; + BLASLONG lda2 = lda * 2; + + size_t vlmax = VSETVL_MAX_M1; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + vlmax = VSETVL(m); + + if (inc_x == 1) + { + for(i = 0; i < n; i++) { + j = 0; + ix = 0; + vr = VFMVVF_FLOAT(0, vlmax); + vi = VFMVVF_FLOAT(0, vlmax); + for(size_t vl, k = m; k > 0; k -= vl) { + vl = VSETVL(k); + + vax2 = VLSEG_FLOAT(&a_ptr[j], vl); + vxx2 = VLSEG_FLOAT(&x[ix], vl); + + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFNMSACVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va1, vx0, vl); +#else + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFNMSACVV_FLOAT_TU(vi, va1, vx0, vl); +#endif + j += vl * 2; + ix += vl * inc_x * 2; + } + + v_res = VFREDSUM_FLOAT_TU(v_res, vr, v_z0, vlmax); + temp_r = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT_TU(v_res, vi, v_z0, vlmax); + temp_i = VFMVFS_FLOAT_M1(v_res); + +#if !defined(XCONJ) + y[iy] += alpha_r * temp_r - alpha_i * temp_i; + y[iy+1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y[iy] += alpha_r * temp_r + alpha_i * temp_i; + y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + iy += inc_y2; + a_ptr += lda2; + } + } + else + { + for(i = 0; i < n; i++) { + j = 0; + ix = 0; + vr = VFMVVF_FLOAT(0, vlmax); + vi = VFMVVF_FLOAT(0, vlmax); + for(size_t vl, k = m; k > 0; k -= vl) { + vl = VSETVL(k); + + vax2 = VLSEG_FLOAT(&a_ptr[j], vl); + vxx2 = VLSSEG_FLOAT(&x[ix], stride_x, vl); + + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFNMSACVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va1, vx0, vl); +#else + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFNMSACVV_FLOAT_TU(vi, va1, vx0, vl); +#endif + j += vl * 2; + ix += vl * inc_x * 2; + } + + v_res = VFREDSUM_FLOAT_TU(v_res, vr, v_z0, vlmax); + temp_r = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT_TU(v_res, vi, v_z0, vlmax); + temp_i = VFMVFS_FLOAT_M1(v_res); + +#if !defined(XCONJ) + y[iy] += alpha_r * temp_r - alpha_i * temp_i; + y[iy+1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y[iy] += alpha_r * temp_r + alpha_i * temp_i; + y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + iy += inc_y2; + a_ptr += lda2; + } + + } + + + return(0); +} diff --git a/kernel/riscv64/zgemv_t_vector.c b/kernel/riscv64/zgemv_t_vector.c index e930dc2a2..5d85ab3a4 100644 --- a/kernel/riscv64/zgemv_t_vector.c +++ b/kernel/riscv64/zgemv_t_vector.c @@ -27,31 +27,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t +#define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() +#define FLOAT_V_T vfloat32m2_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m2_f32m1)(vr, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m2_f32m1)(va, vb, gvl) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m2) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m2) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m2) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m2) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() +#define FLOAT_V_T vfloat64m2_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m2_f64m1)(vr, va, vb, gvl) +#else +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m2_f64m1)(va, vb, gvl) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m2) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m2) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m2) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m2) #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -62,49 +70,43 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT temp_r, temp_i; FLOAT_V_T va0, va1, vx0, vx1, vr, vi; - unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); - + unsigned int gvl = VSETVL(m); + FLOAT_V_T_M1 v_res_r, v_res_i; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; BLASLONG stride_a = sizeof(FLOAT) * 2; - gvl = VSETVL(m); BLASLONG inc_xv = inc_x * gvl * 2; BLASLONG inc_av = gvl * 2; BLASLONG inc_y2 = inc_y * 2; BLASLONG lda2 = lda * 2; + for(i = 0; i < n; i++){ + v_res_r = VFMVVF_FLOAT_M1(0, 1); + v_res_i = VFMVVF_FLOAT_M1(0, 1); gvl = VSETVL(m); j = 0; ix = 0; - vr = VFMVVF_FLOAT(0, gvl); - vi = VFMVVF_FLOAT(0, gvl); for(k = 0; k < m/gvl; k++){ va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl); va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl); vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - vr = VFMACCVV_FLOAT(vr, va0, vx0, gvl); + vr = VFMULVV_FLOAT(va0, vx0, gvl); + vi = VFMULVV_FLOAT(va0, vx1, gvl); vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl); - vi = VFMACCVV_FLOAT(vi, va0, vx1, gvl); vi = VFMACCVV_FLOAT(vi, va1, vx0, gvl); #else - vr = VFMACCVV_FLOAT(vr, va0, vx0, gvl); + vr = VFMULVV_FLOAT(va0, vx0, gvl); + vi = VFMULVV_FLOAT(va0, vx1, gvl); vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl); - vi = VFMACCVV_FLOAT(vi, va0, vx1, gvl); vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl); - #endif + v_res_r = VFREDSUM_FLOAT(v_res_r, vr, v_res_r, gvl); + v_res_i = VFREDSUM_FLOAT(v_res_i, vi, v_res_i, gvl); + j += inc_av; ix += inc_xv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp_r = VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl); - temp_i = VFMVFS_FLOAT(v_res); if(j/2 < m){ gvl = VSETVL(m-j/2); va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl); @@ -113,21 +115,23 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) vr = VFMULVV_FLOAT(va0, vx0, gvl); - vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl); vi = VFMULVV_FLOAT(va0, vx1, gvl); + vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl); vi = VFMACCVV_FLOAT(vi, va1, vx0, gvl); #else vr = VFMULVV_FLOAT(va0, vx0, gvl); - vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl); vi = VFMULVV_FLOAT(va0, vx1, gvl); + vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl); vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl); #endif - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp_r += VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl); - temp_i += VFMVFS_FLOAT(v_res); + v_res_r = VFREDSUM_FLOAT(v_res_r, vr, v_res_r, gvl); + v_res_i = VFREDSUM_FLOAT(v_res_i, vi, v_res_i, gvl); } + + temp_r = VFMVFS_FLOAT(v_res_r); + temp_i = VFMVFS_FLOAT(v_res_i); + #if !defined(XCONJ) y[iy] += alpha_r * temp_r - alpha_i * temp_i; y[iy+1] += alpha_r * temp_i + alpha_i * temp_r; diff --git a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..97013895a --- /dev/null +++ b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c @@ -0,0 +1,139 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VMSLT_VX_INT __riscv_vmslt_vx_i32m2_b16 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VMSLT_VX_INT __riscv_vmslt_vx_i64m2_b32 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + //fprintf(stderr, "%s, %s, m=%ld n=%ld lda=%ld posX=%ld posY=%ld\n", __FUNCTION__, __FILE__, m, n, lda, posX, posY); + + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; + + FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero; + FLOAT_VX2_T va1x2, va2x2, vbx2; + VBOOL_T vbool_gt0, vbool_lt0, vbool_eq0; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + vzero = VFMVVF_FLOAT(ZERO, vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posX * 2 + posY * lda * 2; + ao2 = a + posY * 2 + posX * lda * 2; + + for (i = m; i > 0; i--, offset--) { + va2x2 = VLSSEG2_FLOAT(ao2, stride_lda, vl); + va1x2 = VLSEG2_FLOAT(ao1, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); + vbool_lt0 = VMSLT_VX_INT(vindex, 0, vl); + vbool_eq0 = VMSEQ_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool_gt0, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool_gt0, vl); + + vb2 = VFRSUB_VF_FLOAT(vb1, ZERO, vl); + + vb1 = VMERGE_VVM_FLOAT(vb1, vb2, vbool_lt0, vl); + vb1 = VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); + + b += vl * 2; + ao1 += lda * 2; + ao2 += 2; + } + } + + return 0; +} + diff --git a/kernel/riscv64/zhemm_utcopy_rvv_v1.c b/kernel/riscv64/zhemm_utcopy_rvv_v1.c new file mode 100644 index 000000000..59029e9e5 --- /dev/null +++ b/kernel/riscv64/zhemm_utcopy_rvv_v1.c @@ -0,0 +1,135 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VMSLT_VX_INT __riscv_vmslt_vx_i32m2_b16 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VMSLT_VX_INT __riscv_vmslt_vx_i64m2_b32 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + //fprintf(stderr, "%s, %s, m=%ld n=%ld lda=%ld posX=%ld posY=%ld\n", __FUNCTION__, __FILE__, m, n, lda, posX, posY); + BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; + + FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero; + FLOAT_VX2_T va1x2, va2x2, vbx2; + VBOOL_T vbool_gt0, vbool_eq0; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + vzero = VFMVVF_FLOAT(ZERO, vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posY * 2 + posX * lda * 2; + ao2 = a + posX * 2 + posY * lda * 2; + + for (i = m; i > 0; i--, offset--) { + va1x2 = VLSSEG2_FLOAT(ao1, stride_lda, vl); + va2x2 = VLSEG2_FLOAT(ao2, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); + vbool_eq0 = VMSEQ_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool_gt0, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool_gt0, vl); + + vb2 = VFRSUB_VF_FLOAT(vb1, ZERO, vl); + + vb1 = VMERGE_VVM_FLOAT(vb1, vb2, vbool_gt0, vl); + vb1 = VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); + + b += vl * 2; + ao1 += 2; + ao2 += lda * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/zhemv_LM_rvv.c b/kernel/riscv64/zhemv_LM_rvv.c new file mode 100644 index 000000000..95c6a377c --- /dev/null +++ b/kernel/riscv64/zhemv_LM_rvv.c @@ -0,0 +1,198 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ + BLASLONG i, j, k; + BLASLONG ix, iy, ia; + BLASLONG jx, jy, ja; + FLOAT temp_r1, temp_i1; + FLOAT temp_r2, temp_i2; + FLOAT *a_ptr = a; + unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; + BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, len, lda2; + + BLASLONG inc_x2 = incx * 2; + BLASLONG inc_y2 = incy * 2; + stride_x = inc_x2 * sizeof(FLOAT); + stride_y = inc_y2 * sizeof(FLOAT); + stride_a = 2 * sizeof(FLOAT); + lda2 = lda * 2; + + jx = 0; + jy = 0; + ja = 0; + for(j = 0; j < offset; j++){ + temp_r1 = alpha_r * x[jx] - alpha_i * x[jx+1];; + temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx]; + temp_r2 = 0; + temp_i2 = 0; + y[jy] += temp_r1 * a_ptr[ja]; + y[jy+1] += temp_i1 * a_ptr[ja]; + ix = jx + inc_x2; + iy = jy + inc_y2; + ia = ja + 2; + i = j + 1; + len = m - i; + if(len > 0){ + gvl = VSETVL(len); + inc_xv = incx * gvl * 2; + inc_yv = incy * gvl * 2; + inc_av = gvl * 2; + vr0 = VFMVVF_FLOAT(0, gvl); + vr1 = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + + if(i < m){ + unsigned int gvl_rem = VSETVL(m-i); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl_rem); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl_rem); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl_rem); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl_rem); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl_rem); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl_rem); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl_rem); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl_rem); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); +#else + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); +#endif + } + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 = VFMVFS_FLOAT(v_res); + } + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/zhemv_LM_vector.c b/kernel/riscv64/zhemv_LM_vector.c index 275ee9131..117db7d84 100644 --- a/kernel/riscv64/zhemv_LM_vector.c +++ b/kernel/riscv64/zhemv_LM_vector.c @@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMULVV_FLOAT vfmul_vv_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMULVV_FLOAT vfmul_vv_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ @@ -143,9 +151,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); temp_r2 = VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); temp_i2 = VFMVFS_FLOAT(v_res); if(i < m){ gvl = VSETVL(m-i); @@ -181,9 +189,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); #endif - v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); temp_r2 += VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); temp_i2 += VFMVFS_FLOAT(v_res); } } diff --git a/kernel/riscv64/zhemv_UV_rvv.c b/kernel/riscv64/zhemv_UV_rvv.c new file mode 100644 index 000000000..ec06622fc --- /dev/null +++ b/kernel/riscv64/zhemv_UV_rvv.c @@ -0,0 +1,199 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ + BLASLONG i, j, k; + BLASLONG ix, iy, ia; + BLASLONG jx, jy, ja; + FLOAT temp_r1, temp_i1; + FLOAT temp_r2, temp_i2; + FLOAT *a_ptr = a; + unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; + BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2; + + BLASLONG inc_x2 = incx * 2; + BLASLONG inc_y2 = incy * 2; + stride_x = inc_x2 * sizeof(FLOAT); + stride_y = inc_y2 * sizeof(FLOAT); + stride_a = 2 * sizeof(FLOAT); + lda2 = lda * 2; + + BLASLONG m1 = m - offset; + a_ptr = a + m1 * lda2; + jx = m1 * inc_x2; + jy = m1 * inc_y2; + ja = m1 * 2; + for(j = m1; j < m; j++){ + temp_r1 = alpha_r * x[jx] - alpha_i * x[jx+1];; + temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx]; + temp_r2 = 0; + temp_i2 = 0; + ix = 0; + iy = 0; + ia = 0; + i = 0; + if(j > 0){ + gvl = VSETVL(j); + inc_xv = incx * gvl * 2; + inc_yv = incy * gvl * 2; + inc_av = gvl * 2; + vr0 = VFMVVF_FLOAT(0, gvl); + vr1 = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + + if(i < j){ + unsigned int gvl_rem = VSETVL(j-i); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl_rem); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl_rem); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl_rem); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl_rem); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl_rem); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl_rem); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl_rem); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl_rem); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); +#else + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); +#endif + } + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 = VFMVFS_FLOAT(v_res); + } + y[jy] += temp_r1 * a_ptr[ja]; + y[jy+1] += temp_i1 * a_ptr[ja]; + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c index 2f46977d4..7c6b63bf3 100644 --- a/kernel/riscv64/zhemv_UV_vector.c +++ b/kernel/riscv64/zhemv_UV_vector.c @@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMULVV_FLOAT vfmul_vv_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMULVV_FLOAT vfmul_vv_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ @@ -142,9 +150,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); temp_r2 = VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); temp_i2 = VFMVFS_FLOAT(v_res); if(i < j){ gvl = VSETVL(j-i); @@ -180,9 +188,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); #endif - v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); temp_r2 += VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); temp_i2 += VFMVFS_FLOAT(v_res); } } diff --git a/kernel/riscv64/znrm2.c b/kernel/riscv64/znrm2.c index fc1c8b54a..28bb0eda5 100644 --- a/kernel/riscv64/znrm2.c +++ b/kernel/riscv64/znrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG inc_x2; FLOAT temp; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); inc_x2 = 2 * inc_x; diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c new file mode 100644 index 000000000..32f67758a --- /dev/null +++ b/kernel/riscv64/znrm2_rvv.c @@ -0,0 +1,275 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define MASK_T vbool8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VMFIRSTM __riscv_vfirst_m_b8 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f32m4_f32m1_tu +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 +#define VMFGTVF_FLOAT __riscv_vmfgt_vf_f32m4_b8 +#define VFDIVVF_FLOAT __riscv_vfdiv_vf_f32m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define MASK_T vbool16_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VMFIRSTM __riscv_vfirst_m_b16 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f64m4_f64m1_tu +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 +#define VMFGTVF_FLOAT __riscv_vmfgt_vf_f64m4_b16 +#define VFDIVVF_FLOAT __riscv_vfdiv_vf_f64m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + + if (n <= 0 || inc_x <= 0) return(0.0); + + FLOAT_V_T vr, v0, v_zero; + unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + FLOAT scale = 0.0, ssq = 0.0; + MASK_T mask; + BLASLONG index = 0; + if (inc_x == 1) { + BLASLONG n2 = n * 2; + gvl = VSETVL(n2); + vr = VFMVVF_FLOAT(0, gvl); + v_zero = VFMVVF_FLOAT(0, gvl); + for (i=0,j=0; i 0 ){ // scale change? + // find largest element in v0 and v1 + v_res = VFREDMAX( v0, v_z0, gvl ); + v_res = VFREDMAX( v1, v_res, gvl ); + FLOAT const largest_elt = EXTRACT_FLOAT( v_res ); + + v_scale = VFDIV( v_scale, largest_elt, gvl ); // scale/largest_elt + v_scale = VFMUL( v_scale, v_scale, gvl ); // (scale/largest_elt)*(scale/largest_elt) + v_ssq = VFMUL( v_scale, v_ssq, gvl ); // ssq*(scale/largest_elt)*(scale/largest_elt) + + v_scale = VFMVVF_FLOAT( largest_elt, gvl ); // splated largest_elt becomes new scale } - //ssq in vector vr: vr[0] - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - //total ssq now - ssq += VFMVFS_FLOAT(v_res); - //tail - if(j < n){ - gvl = VSETVL(n-j); - v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(v0, 0, gvl); - v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); - //if scale change - mask = VMFGTVF_FLOAT(v0, scale, gvl); - index = VMFIRSTM(mask, gvl); - if(index == -1){//no elements greater than scale - if(scale != 0.0){ - v0 = VFDIVVF_FLOAT(v0, scale, gvl); - vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); + MASK_T nonzero_mask0 = VMFNE( v0, 0, gvl ); + MASK_T nonzero_mask1 = VMFNE( v1, 0, gvl ); + v0 = VFDIV_M( nonzero_mask0, v_zero, v0, v_scale, gvl ); + v1 = VFDIV_M( nonzero_mask1, v_zero, v1, v_scale, gvl ); + v_ssq = VFMACC_M( nonzero_mask0, v_ssq, v0, v0, gvl ); + v_ssq = VFMACC_M( nonzero_mask1, v_ssq, v1, v1, gvl ); + + idx += inc_x * gvl * 2; + } + + v_res = VFREDUSUM(v_ssq, v_z0, gvl); + FLOAT ssq = EXTRACT_FLOAT(v_res); + FLOAT scale = EXTRACT_FLOAT0_V(v_scale); + + //finish any tail using scalar ops + i*=gvl; + if(i 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSEG_FLOAT(x, vtx2, vl); + VSSEG_FLOAT(y, vyx2, vl); + } + + } else if (inc_x == 1){ + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSEG_FLOAT(x, vtx2, vl); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); + } + + } else if (inc_y == 1){ + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSSEG_FLOAT(x, stride_x, vtx2, vl); + VSSEG_FLOAT(y, vyx2, vl); + } + + } else { + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSSEG_FLOAT(x, stride_x, vtx2, vl); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); + } + } + + return 0; +} diff --git a/kernel/riscv64/zrot_vector.c b/kernel/riscv64/zrot_vector.c index 2fdd8135a..50751b343 100644 --- a/kernel/riscv64/zrot_vector.c +++ b/kernel/riscv64/zrot_vector.c @@ -27,27 +27,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) @@ -59,7 +59,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT unsigned int gvl = 0; FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1; - gvl = VSETVL(n); + gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); BLASLONG inc_xv = inc_x * 2 * gvl; diff --git a/kernel/riscv64/zscal.c b/kernel/riscv64/zscal.c index 0521aaa0b..b2d537d04 100644 --- a/kernel/riscv64/zscal.c +++ b/kernel/riscv64/zscal.c @@ -60,6 +60,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F else { temp = - da_i * x[ip+1] ; + if (isnan(x[ip]) || isinf(x[ip])) temp = NAN; x[ip+1] = da_i * x[ip] ; } } diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c new file mode 100644 index 000000000..ae79d9f9d --- /dev/null +++ b/kernel/riscv64/zscal_rvv.c @@ -0,0 +1,112 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + + if((n <= 0) || (inc_x <= 0)) return(0); + + FLOAT_V_T vt, vr, vi; + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + size_t vlmax = VSETVL_MAX; + FLOAT_VX2_T vx2; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + vx2 = VLSEG_FLOAT(x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); + + vt = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); + vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); + + vx2 = VSET_VX2(vx2, 0, vt); + vx2 = VSET_VX2(vx2, 1, vi); + VSSEG_FLOAT(x, vx2, vl); + } + + } else { + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); + + vt = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); + vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); + + vx2 = VSET_VX2(vx2, 0, vt); + vx2 = VSET_VX2(vx2, 1, vi); + VSSSEG_FLOAT(x, stride_x, vx2, vl); + } + } + + return(0); +} diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index 64323aa3a..536bbdf73 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -27,25 +27,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -59,84 +59,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F unsigned int gvl = 0; FLOAT_V_T vt, v0, v1; - if(da_r == 0.0 && da_i == 0.0){ - gvl = VSETVL(n); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = inc_x * 2 * gvl; - vt = VFMVVF_FLOAT(0.0, gvl); - for(i=0,j=0; i < n/(gvl*2); i++){ - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+inc_xv], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+inc_xv+1], stride_x, vt, gvl); - - j += gvl*2; - ix += inc_xv*2; - } - for(; j < n; ){ - gvl = VSETVL(n-j); - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); - j += gvl; - ix += inc_x * 2 * gvl; - } - }else if(da_r == 0.0){ - gvl = VSETVL(n); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = inc_x * 2 * gvl; - for(i=0,j=0; i < n/gvl; i++){ - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v1, -da_i, gvl); - v1 = VFMULVF_FLOAT(v0, da_i, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - - j += gvl; - ix += inc_xv; - } - if(j < n){ - gvl = VSETVL(n-j); - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v1, -da_i, gvl); - v1 = VFMULVF_FLOAT(v0, da_i, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - } - }else if(da_i == 0.0){ - gvl = VSETVL(n); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = inc_x * 2 * gvl; - for(i=0,j=0; i < n/gvl; i++){ - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v0, da_r, gvl); - v1 = VFMULVF_FLOAT(v1, da_r, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - - j += gvl; - ix += inc_xv; - } - if(j < n){ - gvl = VSETVL(n-j); - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v0, da_r, gvl); - v1 = VFMULVF_FLOAT(v1, da_r, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - } - }else{ + { gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG inc_xv = inc_x * 2 * gvl; diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c new file mode 100644 index 000000000..489188bd5 --- /dev/null +++ b/kernel/riscv64/zsum_rvv.c @@ -0,0 +1,107 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m4_tu +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m4_tu +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + FLOAT_V_T v0, v1; + FLOAT_VX2_T vx2; + size_t vlmax = VSETVL_MAX; + FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + vx2 = VLSEG_FLOAT(x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); + + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); + + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); + } + + } + + FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_res = VFREDSUMVS_FLOAT(v_sum, v_res, vlmax); + sumf += VFMVFS_FLOAT_M1(v_res); + + return(sumf); +} diff --git a/kernel/riscv64/zsum_vector.c b/kernel/riscv64/zsum_vector.c new file mode 100644 index 000000000..ca0b02b5c --- /dev/null +++ b/kernel/riscv64/zsum_vector.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b32 +# else +# define ELEN 32 +# define MLEN _b16 +# endif +#else +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b8 +# else +# define ELEN 32 +# define MLEN _b4 +# endif +#endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt), _vf_f, ELEN, LMUL, MLEN) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_zero,v_sum; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); + + if(inc_x == 1){ + BLASLONG n2 = n * 2; + gvl = VSETVL(n2); + v_zero = VFMVVF_FLOAT(0, gvl); + if(gvl <= n2/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i 0; m -= vl * 2, ptr -= vl*inc_y * 2) { + vl = VSETVL(m); + vyx2 = VLSSEG_FLOAT(ptr - 2, stride_y, vl); + VSSSEG_FLOAT(ptr, stride_y, vyx2, vl); + } + y[0] = temp[0]; + y[1] = temp[1]; + } + else if(inc_y == 0) { + FLOAT temp[2]; + temp[0] = y[0]; + temp[1] = y[1]; + y[0] = x[(n - 1) * inc_x * 2]; + y[0] = x[(n - 1) * inc_x * 2 + 1]; + FLOAT* ptr = x + (n - 1) * inc_x * 2; // start from the last one + BLASLONG stride_x = (0 - inc_x) * sizeof(FLOAT) * 2; // reverse + BLASLONG m = n - 1; + for (size_t vl; m > 0; m -= vl * 2, ptr -= vl*inc_x * 2) { + vl = VSETVL(m); + vxx2 = VLSSEG_FLOAT(ptr - 2, stride_x, vl); + VSSSEG_FLOAT(ptr, stride_x, vxx2, vl); + } + x[0] = temp[0]; + x[1] = temp[1]; + } + else if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + VSSEG_FLOAT(y, vxx2, vl); + VSSEG_FLOAT(x, vyx2, vl); + } + + } else if (inc_x == 1){ + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + VSSSEG_FLOAT(y, stride_y, vxx2, vl); + VSSEG_FLOAT(x, vyx2, vl); + } + + } else if (inc_y == 1){ + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + VSSEG_FLOAT(y, vxx2, vl); + VSSSEG_FLOAT(x, stride_x, vyx2, vl); + } + + } else { + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + VSSSEG_FLOAT(y, stride_y, vxx2, vl); + VSSSEG_FLOAT(x, stride_x, vyx2, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c index 7550294b5..f2734c4a9 100644 --- a/kernel/riscv64/zswap_vector.c +++ b/kernel/riscv64/zswap_vector.c @@ -27,35 +27,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 64 +# else +# define ELEN 32 +# define MLEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 16 +# else +# define ELEN 32 +# define MLEN 8 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) + int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0, j = 0; BLASLONG ix = 0,iy = 0; BLASLONG stride_x, stride_y; FLOAT_V_T vx0, vx1, vy0, vy1; - unsigned int gvl = 0; + unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); + if( inc_x == 0 && inc_y == 0 ) { n = n & 1; } - if (n < 0) return(0); + if (n <= 0) return(0); if(inc_x == 1 && inc_y == 1){ - gvl = VSETVL(n); BLASLONG n2 = n * 2; if(gvl <= n2/2){ for(i=0,j=0; i 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posX * 2 + posY * lda * 2; + ao2 = a + posY * 2 + (posX) * lda * 2; + + for (i = m; i > 0; i--, offset--) { + + va2x2 = VLSSEG2_FLOAT(ao2, stride_lda, vl); + va1x2 = VLSEG2_FLOAT(ao1, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); + + b += vl * 2; + ao1 += lda * 2; + ao2 += 2; + } + } + + return 0; +} + diff --git a/kernel/riscv64/zsymm_ucopy_rvv_v1.c b/kernel/riscv64/zsymm_ucopy_rvv_v1.c new file mode 100644 index 000000000..069551bb0 --- /dev/null +++ b/kernel/riscv64/zsymm_ucopy_rvv_v1.c @@ -0,0 +1,121 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT)*lda * 2; + + FLOAT_V_T vb0, vb1, va10, va11, va20, va21; + FLOAT_VX2_T va1x2, va2x2, vbx2; + VBOOL_T vbool; + INT_V_T vindex_max, vindex; + + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posY * 2 + (posX + 0) * lda * 2; + ao2 = a + posX * 2 + 0 + posY * lda * 2; + + for (i = m; i > 0; i--, offset--) { + va1x2 = VLSSEG2_FLOAT(ao1, stride_lda, vl); + va2x2 = VLSEG2_FLOAT(ao2, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); + + b += vl * 2; + ao1 += 2; + ao2 += lda * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/zsymv_L_rvv.c b/kernel/riscv64/zsymv_L_rvv.c new file mode 100644 index 000000000..cefdea7f6 --- /dev/null +++ b/kernel/riscv64/zsymv_L_rvv.c @@ -0,0 +1,179 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VFNEGV_FLOAT __riscv_vfneg_v_f32mf4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VFNEGV_FLOAT __riscv_vfneg_v_f64mf4 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1[2]; + FLOAT temp2[2]; + FLOAT *a_ptr = a; + BLASLONG gvl = VSETVL_MAX; + FLOAT_V_T_M1 v_res, v_z0; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + FLOAT_V_T va_r, va_i, vx_r, vx_i, vy_r, vy_i, vr_r, vr_i; + BLASLONG stride_x, stride_y, inc_xv, inc_yv, len; + + stride_x = 2 * inc_x * sizeof(FLOAT); + stride_y = 2 * inc_y * sizeof(FLOAT); + jx = 0; + jy = 0; + for (j=0; j 0){ + gvl = VSETVL(len); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr_r = VFMVVF_FLOAT(0, gvl); + vr_i = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); + vr_r = VFMACCVV_FLOAT(vr_r, vx_r, va_r, gvl); + vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_r, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + + if(i < m){ + unsigned int gvl_rem = VSETVL(m-i); + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl_rem); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl_rem); + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl_rem); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl_rem); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl_rem); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl_rem); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl_rem); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl_rem); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl_rem); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl_rem); + vr_r = VFMACCVV_FLOAT_TU(vr_r, vx_r, va_r, gvl_rem); + vr_r = VFNMSACVV_FLOAT_TU(vr_r, vx_i, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_r, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_i, va_r, gvl_rem); + + } + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] = VFMVFS_FLOAT_M1(v_res); + } + y[2 * jy] += alpha_r * temp2[0] - alpha_i * temp2[1]; + y[2 * jy + 1] += alpha_r * temp2[1] + alpha_i * temp2[0]; + + jx += inc_x; + jy += inc_y; + a_ptr += 2 * lda; + } + + return(0); +} + diff --git a/kernel/riscv64/zsymv_U_rvv.c b/kernel/riscv64/zsymv_U_rvv.c new file mode 100644 index 000000000..67b5a649c --- /dev/null +++ b/kernel/riscv64/zsymv_U_rvv.c @@ -0,0 +1,177 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1[2]; + FLOAT temp2[2]; + FLOAT *a_ptr = a; + BLASLONG gvl = VSETVL_MAX; + FLOAT_V_T_M1 v_res, v_z0; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + + FLOAT_V_T va_r, va_i, vx_r, vx_i, vy_r, vy_i, vr_r, vr_i; + BLASLONG stride_x, stride_y, inc_xv, inc_yv; + + BLASLONG m1 = m - offset; + jx = m1 * inc_x; + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_x = 2 * inc_x * sizeof(FLOAT); + stride_y = 2 * inc_y * sizeof(FLOAT); + for (j=m1; j 0){ + ix = 0; + iy = 0; + i = 0; + gvl = VSETVL(j); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr_r = VFMVVF_FLOAT(0, gvl); + vr_i = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); + vr_r = VFMACCVV_FLOAT(vr_r, vx_r, va_r, gvl); + vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_r, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + + if(i < j){ + unsigned int gvl_rem = VSETVL(j-i); + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl_rem); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl_rem); + + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl_rem); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl_rem); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl_rem); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl_rem); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl_rem); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl_rem); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl_rem); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl_rem); + vr_r = VFMACCVV_FLOAT_TU(vr_r, vx_r, va_r, gvl_rem); + vr_r = VFNMSACVV_FLOAT_TU(vr_r, vx_i, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_r, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_i, va_r, gvl_rem); + + } + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] = VFMVFS_FLOAT_M1(v_res); + } + + y[2 * jy] += temp1[0] * a_ptr[j * 2] - temp1[1] * a_ptr[j * 2 + 1] + alpha_r * temp2[0] - alpha_i * temp2[1]; + y[2 * jy + 1] += temp1[1] * a_ptr[j * 2] + temp1[0] * a_ptr[j * 2 + 1] + alpha_r * temp2[1] + alpha_i * temp2[0]; + + a_ptr += 2 * lda; + jx += inc_x; + jy += inc_y; + } + + return(0); +} + diff --git a/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c b/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c new file mode 100644 index 000000000..d7d5e5fea --- /dev/null +++ b/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c @@ -0,0 +1,805 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=4 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f64m2' + VFMUL='__riscv_vfmul_vf_f64m2' + VLEV='__riscv_vle64_v_f64m2' + VLSEV='__riscv_vlse64_v_f64m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m2' + VSETVL='__riscv_vsetvl_e64m2' + VSEV='__riscv_vse64_v_f64m2' + VSSEV='__riscv_vsse64_v_f64m2' + acc_vector_t='vfloat64m2_t' + output='ztrmm_kernel_4x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +#if defined(LEFT) != defined(TRANSA) +#define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m2(4); + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + double B2r = B[bi + 2 * 2 + 0]; + double B2i = B[bi + 2 * 2 + 1]; + double B3r = B[bi + 3 * 2 + 0]; + double B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat64m2_t ACC2r = tmp0r; + vfloat64m2_t ACC2i = tmp0i; + vfloat64m2_t ACC3r = tmp1r; + vfloat64m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat64m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat64m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + vfloat64m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl); + vfloat64m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl); + vfloat64m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl); + vfloat64m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + // -- tails for main pass + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + double result8 = 0; + double result9 = 0; + double result10 = 0; + double result11 = 0; + double result12 = 0; + double result13 = 0; + double result14 = 0; + double result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = result8 * alphar; + Ci = result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result10 * alphar; + Ci = result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = result12 * alphar; + Ci = result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = result14 * alphar; + Ci = result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat64m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat64m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c b/kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c new file mode 100644 index 000000000..de7622b89 --- /dev/null +++ b/kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c @@ -0,0 +1,1337 @@ +/* + +AUTOGENERATED KERNEL +Settings: + LMUL=1 + M=8 + M_tail_scalar_from=1 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl256b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=256 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=1 + VFMACC='__riscv_vfmacc_vf_f64m1' + VFMUL='__riscv_vfmul_vf_f64m1' + VLEV='__riscv_vle64_v_f64m1' + VLSEV='__riscv_vlse64_v_f64m1' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m1' + VSETVL='__riscv_vsetvl_e64m1' + VSEV='__riscv_vse64_v_f64m1' + VSSEV='__riscv_vsse64_v_f64m1' + acc_vector_t='vfloat64m1_t' + output='ztrmm_kernel_8x4_zvl256b.c' + param_scalar_t='double' + param_vector_t='vfloat64m1_t' + +*/ + +#include "common.h" + + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define S0 1 + #define S1 -1 + #define S2 1 + #define S3 1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define S0 1 + #define S1 1 + #define S2 1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define S0 1 + #define S1 1 + #define S2 -1 + #define S3 1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define S0 1 + #define S1 -1 + #define S2 -1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfnmacc +#endif + + +#if defined(LEFT) != defined(TRANSA) + #define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + + // -- MAIN PASS + + for (BLASLONG j=0; j +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VBOOL_T vbool16_t +#define UINT_V_T vint32m2_t +#define VID_V_UINT __riscv_vid_v_i32m2 +#define VMSGTU_VX_UINT __riscv_vmsgt_vx_i32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_i32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + + FLOAT_VX2_T vax2; + FLOAT_V_T va0, va1; + + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY * 2 + posX * lda * 2; + } + else + { + ao = a + posX * 2 + posY * lda * 2; + } + + i = 0; + do + { + if (X > posY) + { + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); + + ao += 2; + b += vl * 2; + + X ++; + i ++; + } + else if (X < posY) + { + ao += lda * 2; + b += vl * 2; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); +#endif + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); + ao += 2; + b += vl * 2; + } + + X += vl; + i += vl; + } + } while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..ab8d34337 --- /dev/null +++ b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + FLOAT_VX2_T vax2; + FLOAT_V_T va0, va1; + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY * 2 + posX * lda * 2; + } + else + { + ao = a + posX * 2 + posY * lda * 2; + } + + i = 0; + do + { + if (X > posY) + { + ao += 2; + b += vl * 2; + X++; + i++; + } + else if (X < posY) + { + //va1 = VLEV_FLOAT(ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); + + ao += lda * 2; + b += vl * 2; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + //va1 = VLEV_FLOAT(ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); +#endif + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); + ao += lda * 2; + b += vl * 2; + } + X += vl; + i += vl; + + } + } while (i < m); + + posY += vl; + } + + return 0; +} + diff --git a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c new file mode 100644 index 000000000..ba6e63b96 --- /dev/null +++ b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c @@ -0,0 +1,154 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; + FLOAT *ao; + + FLOAT_VX2_T vax2; + FLOAT_V_T va0, va1; + size_t vl; + +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posX * 2 + posY * lda * 2; + } + else + { + ao = a + posY * 2 + posX * lda * 2; + } + + i = 0; + do + { + if (X < posY) + { + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); + + ao += 2; + b += vl * 2; + + X++; + i++; + } + else if (X > posY) + { + ao += lda * 2; + b += vl * 2; + + X++; + i++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); +#endif + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); + ao += 2; + b += vl * 2; + } + + X += vl; + i += vl; + } + }while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c new file mode 100644 index 000000000..a624fff54 --- /dev/null +++ b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c @@ -0,0 +1,151 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, j, js, X; + + FLOAT *ao; + + FLOAT_VX2_T vax2; + FLOAT_V_T va0, va1; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + + X = posX; + + if (posX <= posY) + { + ao = a + posX * 2 + posY * lda * 2; + } + else + { + ao = a + posY * 2 + posX * lda * 2; + } + + i = 0; + do + { + if (X < posY) + { + ao += 2; + b += vl * 2; + X++; + i++; + } + else if (X > posY) + { + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); + ao += lda * 2; + b += vl * 2; + X++; + i++; + } + else + { + vindex = VID_V_UINT(vl); + for (j = 0; j < vl; j++) + { + vax2 = VLSEG2_FLOAT(ao, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); +#endif + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); + ao += lda * 2; + b += vl * 2; + } + X += vl; + i += vl; + } + }while (i < m); + posY += vl; + } + + return 0; +} + diff --git a/kernel/riscv64/ztrmmkernel_2x2_rvv.c b/kernel/riscv64/ztrmmkernel_2x2_rvv.c new file mode 100644 index 000000000..399124d2e --- /dev/null +++ b/kernel/riscv64/ztrmmkernel_2x2_rvv.c @@ -0,0 +1,596 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEG4_FLOAT __riscv_vlseg4e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m2 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m2 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m2_f32m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEG4_FLOAT __riscv_vlseg4e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m2 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m2 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m2_f64m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +// Optimizes the implementation in ../generic/ztrmmkernel_2x2.c + + +/******************************** + ADD1 a*c + ADD2 b*c + ADD3 a*d + ADD4 b*d + *********************************/ +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb, + FLOAT* C,BLASLONG ldc, BLASLONG offset) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + FLOAT res0,res1; + BLASLONG off, temp; + + FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + FLOAT_V_T_M1 v_m1_res0, v_m1_res1; + FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + + size_t vl; + size_t vlmax = VSETVL_MAX; + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j = bn/2; j > 0; j--) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + C0 = C; + C1 = C0+2*ldc; + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2*2; + ptrbb = bb+off*2*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + vres4 = VFMVVF_FLOAT(0.0, vlmax); + vres5 = VFMVVF_FLOAT(0.0, vlmax); + vres6 = VFMVVF_FLOAT(0.0, vlmax); + vres7 = VFMVVF_FLOAT(0.0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk - off; +#elif defined(LEFT) + temp = off + 2; +#else + temp = off + 2; +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFNMSACVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFNMSACVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl); + +#endif + ptrba += vl * 4; + ptrbb += vl * 4; + } + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[2] = res0 * alphar - res1 * alphai; + C0[3] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres4, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres5, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C1[0] = res0 * alphar - res1 * alphai; + C1[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres6, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres7, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C1[2] = res0 * alphar - res1 * alphai; + C1[3] = res1 * alphar + res0 * alphai; +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; +#else + temp -= 2; +#endif + + ptrba += temp*2*2; + ptrbb += temp*2*2; + +#endif + +#ifdef LEFT + off += 2; +#endif + + C0 = C0+4; + C1 = C1+4; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*2*2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk - off; +#elif defined(LEFT) + temp = off+1; +#else + temp = off+2; +#endif + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl); + +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); + +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl); + +#endif + ptrba += vl * 2; + ptrbb += vl * 4; + } + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C1[0] = res0 * alphar - res1 * alphai; + C1[1] = res1 * alphar + res0 * alphai; + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; +#else + temp -= 2; +#endif + ptrba += temp*2; + ptrbb += temp*2*2; +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+2; + C1 = C1+2; + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + k = (bk<<2); + bb = bb+k; + i = (ldc<<2); + C = C+i; + } + + if (bn & 1) + { + C0 = C; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2*2; + ptrbb = bb+off*2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk - off; +#elif defined(LEFT) + temp = off + 2; +#else + temp = off + 1; +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); + +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); + +#endif + ptrba += vl * 4; + ptrbb += vl * 2; + } + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[2] = res0 * alphar - res1 * alphai; + C0[3] = res1 * alphar + res0 * alphai; + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk-off; +#ifdef LEFT + temp -= 2; +#else + temp -= 1; +#endif + ptrba += temp*2*2; + ptrbb += temp*2; +#endif +#ifdef LEFT + off += 2; +#endif + C0 = C0+4; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off + 1; +#else + temp = off + 1; +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + +#endif + ptrba += vl * 2; + ptrbb += vl * 2; + + } + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; +#else + temp -= 1; +#endif + ptrba += temp*2; + ptrbb += temp*2; + +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+2; + } + k = (bk<<1); + bb = bb+k; + i = (ldc<<1); + C = C+i; + } + return 0; +} diff --git a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c new file mode 100644 index 000000000..db5f06af8 --- /dev/null +++ b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c @@ -0,0 +1,632 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#endif + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C, BLASLONG ldc, BLASLONG offset) +{ + BLASLONG i,j,k; + FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb; + BLASLONG off, temp; + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + FLOAT_VX2_T vax2; + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + + //fprintf(stderr, "%s, bn=%ld bm=%ld bk=%ld alphar=%f alphai=%f ldc=%ld, offset=%ld\n", __FUNCTION__, bn, bm, bk, alphar, alphai, ldc, offset); // Debug + + size_t vl; + for (j = bn/4; j > 0; j--) + { + C0 = C; + C1 = C0 + 2 * ldc; + C2 = C1 + 2 * ldc; + C3 = C2 + 2 * ldc; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl*2; + ptrbb = bb + off*4*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+4; // number of values in B +#endif + + for (k = temp/4; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va2, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va3, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va3, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va2, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va2, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va3, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va3, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va2, vl); + + ptrbb += 8; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va4, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va5, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va5, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va4, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va4, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va5, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va5, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va4, vl); + + ptrbb += 8; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va6, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va7, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va7, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va6, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va6, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va7, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va7, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va6, vl); + + ptrbb += 8; + } + + for (k = temp & 3; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + } + va0 = VFMULVF_FLOAT(vres0, alphar, vl); + va1 = VFMULVF_FLOAT(vres1, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); + + va2 = VFMULVF_FLOAT(vres2, alphar, vl); + va3 = VFMULVF_FLOAT(vres3, alphar, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); + + va0 = VFMULVF_FLOAT(vres4, alphar, vl); + va1 = VFMULVF_FLOAT(vres5, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres4, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C2, vax2, vl); + + va2 = VFMULVF_FLOAT(vres6, alphar, vl); + va3 = VFMULVF_FLOAT(vres7, alphar, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres6, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C3, vax2, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 4; // number of values in B +#endif + ptrba += temp*vl*2; + ptrbb += temp*4*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl * 2; + C1 += vl * 2; + C2 += vl * 2; + C3 += vl * 2; + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; +#endif + + bb += (bk << 3); + C += (ldc << 3); + } + + if (bn & 2) + { + C0 = C; + C1 = C0 + 2 * ldc; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl*2; + ptrbb = bb + off*2*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+2; // number of values in B +#endif + for (k = temp/4; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + ptrbb += 4; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + ptrbb += 4; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + ptrbb += 4; + } + + for (k = temp & 3; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + } + + va0 = VFMULVF_FLOAT(vres0, alphar, vl); + va1 = VFMULVF_FLOAT(vres1, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); + + va2 = VFMULVF_FLOAT(vres2, alphar, vl); + va3 = VFMULVF_FLOAT(vres3, alphar, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*vl*2; + ptrbb += temp*2*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + C0 += vl * 2; + C1 += vl * 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + bb += (bk << 2); + C += (ldc << 2); + } + + if (bn & 1) + { + C0 = C; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl*2; + ptrbb = bb + off*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+1; // number of values in B +#endif + for (k = temp/4; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + ptrbb += 2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + ptrbb += 2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + ptrbb += 2; + } + + for (k = temp & 3; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + } + + va0 = VFMULVF_FLOAT(vres0, alphar, vl); + va1 = VFMULVF_FLOAT(vres1, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*vl*2; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + C0 += vl * 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; +#endif + bb += bk << 1; + C += ldc << 1; + } + return 0; +} diff --git a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c new file mode 100644 index 000000000..36cec711d --- /dev/null +++ b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + + FLOAT_VX2_T vax2; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); + + compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1)); + ao += 2; + b += vl * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); + } + ao += 2; + b += vl * 2; + i++; + ii++; + } + } + + a += vl * lda * 2; + jj += vl; + } + + return 0; +} diff --git a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..3a7bdb522 --- /dev/null +++ b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c @@ -0,0 +1,114 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + FLOAT_VX2_T vax2; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1)); + + vax2 = VLSEG2_FLOAT(ao, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); + + b += vl * 2; + ao += lda * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); + } + ao += lda * 2; + b += vl * 2; + i ++; + ii ++; + } + } + + a += vl * 2; + jj += vl; + } + return 0; +} + diff --git a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c new file mode 100644 index 000000000..2a158d4de --- /dev/null +++ b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c @@ -0,0 +1,113 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + + FLOAT *ao; + jj = offset; + + FLOAT_VX2_T vax2; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + i = 0; + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1)); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); + ao += 2; + b += vl * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); + } + ao += 2; + b += vl * 2; + i++; + ii++; + } + } + + a += vl * lda * 2; + jj += vl; + } + return 0; +} diff --git a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c new file mode 100644 index 000000000..4b3319588 --- /dev/null +++ b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + FLOAT_VX2_T vax2; + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + vax2 = VLSEG2_FLOAT(ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); + + compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1)); + + ao += lda * 2; + b += vl * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); + } + ao += lda * 2; + b += vl * 2; + i ++; + ii ++; + } + } + + a += vl * 2; + jj += vl; + } + + return 0; +} diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 14a339e75..4c361f155 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -49,7 +50,9 @@ static void init_parameter(void); gotoblas_t TABLE_NAME = { - DTB_DEFAULT_ENTRIES , + DTB_DEFAULT_ENTRIES, + + SWITCH_RATIO, GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, @@ -63,6 +66,7 @@ gotoblas_t TABLE_NAME = { #endif SBGEMM_ALIGN_K, + 0, // need_amxtile_permission sbstobf16_kTS, sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS, @@ -1806,6 +1810,12 @@ static void init_parameter(void) { #endif +#ifdef SAPPHIRERAPIDS +#if (BUILD_BFLOAT16 == 1) + TABLE_NAME.need_amxtile_permission = 1; +#endif +#endif + #if BUILD_COMPLEX==1 #ifdef CGEMM3M_DEFAULT_P TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; diff --git a/kernel/sparc/cnrm2.S b/kernel/sparc/cnrm2.S index 8dc4b56b6..0840c8848 100644 --- a/kernel/sparc/cnrm2.S +++ b/kernel/sparc/cnrm2.S @@ -76,7 +76,7 @@ FMOV c1, t4 cmp INCX, 0 - ble .LL20 + beq .LL20 sll INCX, ZBASE_SHIFT, INCX cmp N, 0 diff --git a/kernel/sparc/dnrm2.S b/kernel/sparc/dnrm2.S index cf7522953..41e993440 100644 --- a/kernel/sparc/dnrm2.S +++ b/kernel/sparc/dnrm2.S @@ -107,7 +107,7 @@ FMOV fzero, c1 cmp INCX, 0 - ble .LL99 + beq .LL99 sll INCX, BASE_SHIFT, INCX add %sp, -8, %sp diff --git a/kernel/sparc/snrm2.S b/kernel/sparc/snrm2.S index a80247259..a7405b6e1 100644 --- a/kernel/sparc/snrm2.S +++ b/kernel/sparc/snrm2.S @@ -76,7 +76,7 @@ FMOV c1, t4 cmp INCX, 0 - ble .LL20 + beq .LL20 sll INCX, BASE_SHIFT, INCX cmp N, 0 diff --git a/kernel/sparc/znrm2.S b/kernel/sparc/znrm2.S index 065d22784..dae53ffe7 100644 --- a/kernel/sparc/znrm2.S +++ b/kernel/sparc/znrm2.S @@ -107,7 +107,7 @@ FMOV fzero, c1 cmp INCX, 0 - ble .LL99 + beq .LL99 sll INCX, ZBASE_SHIFT, INCX add %sp, -8, %sp diff --git a/kernel/x86/nrm2.S b/kernel/x86/nrm2.S index 7a14da862..3a6417462 100644 --- a/kernel/x86/nrm2.S +++ b/kernel/x86/nrm2.S @@ -78,7 +78,7 @@ testl M, M jle .L999 testl INCX, INCX - jle .L999 + je .L999 sall $BASE_SHIFT, INCX fldz diff --git a/kernel/x86/nrm2_sse.S b/kernel/x86/nrm2_sse.S index 0f174c408..129b41a03 100644 --- a/kernel/x86/nrm2_sse.S +++ b/kernel/x86/nrm2_sse.S @@ -69,7 +69,7 @@ jle .L999 pxor %xmm1, %xmm1 testl INCX, INCX - jle .L999 + je .L999 leal (, INCX, SIZE), INCX cmpl $SIZE, INCX diff --git a/kernel/x86/znrm2.S b/kernel/x86/znrm2.S index 263612e9a..7a65df77a 100644 --- a/kernel/x86/znrm2.S +++ b/kernel/x86/znrm2.S @@ -78,7 +78,7 @@ testl M, M jle .L999 testl INCX, INCX - jle .L999 + je .L999 sall $ZBASE_SHIFT, INCX fldz diff --git a/kernel/x86/znrm2_sse.S b/kernel/x86/znrm2_sse.S index bbc3677ae..4ad326120 100644 --- a/kernel/x86/znrm2_sse.S +++ b/kernel/x86/znrm2_sse.S @@ -69,7 +69,7 @@ jle .L999 pxor %xmm1, %xmm1 testl INCX, INCX - jle .L999 + je .L999 sall $ZBASE_SHIFT, INCX diff --git a/kernel/x86/zscal.S b/kernel/x86/zscal.S index 1eb518563..adb56edb9 100644 --- a/kernel/x86/zscal.S +++ b/kernel/x86/zscal.S @@ -98,7 +98,7 @@ fcomip %st(1), %st ffreep %st(0) jne .L30 - +jp .L30 EMMS pxor %mm0, %mm0 diff --git a/kernel/x86/zscal_sse.S b/kernel/x86/zscal_sse.S index e011c98f5..89e36251e 100644 --- a/kernel/x86/zscal_sse.S +++ b/kernel/x86/zscal_sse.S @@ -87,6 +87,7 @@ xorps %xmm7, %xmm7 comiss %xmm0, %xmm7 jne .L100 # Alpha_r != ZERO + jp .L100 # Alpha_r NaN comiss %xmm1, %xmm7 jne .L100 # Alpha_i != ZERO diff --git a/kernel/x86/zscal_sse2.S b/kernel/x86/zscal_sse2.S index cc7ab6686..0bc61b209 100644 --- a/kernel/x86/zscal_sse2.S +++ b/kernel/x86/zscal_sse2.S @@ -98,6 +98,7 @@ xorps %xmm7, %xmm7 comisd %xmm0, %xmm7 jne .L100 + jp .L100 comisd %xmm1, %xmm7 jne .L100 diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index bea7036c2..ec4290e82 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -405,7 +405,7 @@ DGEMVNKERNEL = dgemv_n.S endif ifndef DGEMVTKERNEL -DGEMVTKERNEL = dgemv_t.S +DGEMVTKERNEL = dgemv_t_4.c endif ifndef CGEMVNKERNEL @@ -489,5 +489,7 @@ XGEMM3MKERNEL = xgemm3m_kernel_2x2.S SSUMKERNEL = ../arm/sum.c DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = zsum_sse.S +ZSUMKERNEL = zsum_sse2.S SOMATCOPY_RT = omatcopy_rt.c diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE index dba94aea8..22b042029 100644 --- a/kernel/x86_64/KERNEL.COOPERLAKE +++ b/kernel/x86_64/KERNEL.COOPERLAKE @@ -1,5 +1,5 @@ include $(KERNELDIR)/KERNEL.SKYLAKEX - +ifneq ($(NO_AVX512BF16), 1) SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_cooperlake.c SBGEMM_SMALL_K_NN = sbgemm_small_kernel_nn_cooperlake.c SBGEMM_SMALL_K_B0_NN = sbgemm_small_kernel_nn_cooperlake.c @@ -20,3 +20,4 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS index 88f574668..0ab2b4ddc 100644 --- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS +++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS @@ -1,6 +1,15 @@ include $(KERNELDIR)/KERNEL.COOPERLAKE -SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_spr.c +ifneq ($(NO_AVX512BF16), 1) +SBGEMM_SMALL_M_PERMIT = +SBGEMM_SMALL_K_NN = +SBGEMM_SMALL_K_B0_NN = +SBGEMM_SMALL_K_NT = +SBGEMM_SMALL_K_B0_NT = +SBGEMM_SMALL_K_TN = +SBGEMM_SMALL_K_B0_TN = +SBGEMM_SMALL_K_TT = +SBGEMM_SMALL_K_B0_TT = SBGEMM_BETA = sgemm_beta_skylakex.c SBGEMMKERNEL = sbgemm_kernel_16x16_spr.c @@ -12,3 +21,4 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 548e5dcfc..7e946ef2e 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -46,3 +46,5 @@ ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c CASUMKERNEL = casum.c ZASUMKERNEL = zasum.c +CSUMKERNEL = csum.c +ZSUMKERNEL = zsum.c diff --git a/kernel/x86_64/casum.c b/kernel/x86_64/casum.c index 60feec0ce..28d78d279 100644 --- a/kernel/x86_64/casum.c +++ b/kernel/x86_64/casum.c @@ -4,17 +4,17 @@ #define ABS_K(a) ((a) > 0 ? (a) : (-(a))) #endif -#if defined(SKYLAKEX) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #include "casum_microk_skylakex-2.c" #endif #ifndef HAVE_CASUM_KERNEL -static FLOAT casum_kernel(BLASLONG n, FLOAT *x1) +static FLOAT casum_kernel(BLASLONG n, FLOAT *x) { BLASLONG i=0; BLASLONG n_8 = n & -8; - FLOAT *x = x1; + FLOAT *x1 = x; FLOAT temp0, temp1, temp2, temp3; FLOAT temp4, temp5, temp6, temp7; FLOAT sum0 = 0.0; @@ -24,14 +24,14 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x1) FLOAT sum4 = 0.0; while (i < n_8) { - temp0 = ABS_K(x[0]); - temp1 = ABS_K(x[1]); - temp2 = ABS_K(x[2]); - temp3 = ABS_K(x[3]); - temp4 = ABS_K(x[4]); - temp5 = ABS_K(x[5]); - temp6 = ABS_K(x[6]); - temp7 = ABS_K(x[7]); + temp0 = ABS_K(x1[0]); + temp1 = ABS_K(x1[1]); + temp2 = ABS_K(x1[2]); + temp3 = ABS_K(x1[3]); + temp4 = ABS_K(x1[4]); + temp5 = ABS_K(x1[5]); + temp6 = ABS_K(x1[6]); + temp7 = ABS_K(x1[7]); sum0 += temp0; sum1 += temp1; @@ -43,12 +43,12 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x1) sum2 += temp6; sum3 += temp7; - x+=8; + x1+=8; i+=4; } while (i < n) { - sum4 += (ABS_K(x1[0]) + ABS_K(x1[1])); + sum4 += ABS_K(x1[0]) + ABS_K(x1[1]); x1 += 2; i++; } diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c index b398aa6e1..10b70ff20 100644 --- a/kernel/x86_64/casum_microk_skylakex-2.c +++ b/kernel/x86_64/casum_microk_skylakex-2.c @@ -1,5 +1,10 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203)) + +#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203)) #define HAVE_CASUM_KERNEL 1 @@ -15,15 +20,14 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x) if (n2 < 64) { __m128 accum_10, accum_11, accum_12, accum_13; - __m128 abs_mask1 = abs_mask1; + __m128 abs_mask1; accum_10 = _mm_setzero_ps(); accum_11 = _mm_setzero_ps(); accum_12 = _mm_setzero_ps(); accum_13 = _mm_setzero_ps(); - abs_mask1 = (__m128)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1); - abs_mask1 = (__m128)_mm_srli_epi32((__m128i) abs_mask1, 1); + abs_mask1 = (__m128)_mm_set1_epi32(0x7fffffff); _mm_prefetch(&x1[0], _MM_HINT_T0); @@ -347,3 +351,4 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x) return sumf; } #endif +#endif diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c index 3ca173c20..94e6d2c77 100644 --- a/kernel/x86_64/cgemv_n_4.c +++ b/kernel/x86_64/cgemv_n_4.c @@ -24,9 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif -#include -#include #include "common.h" #if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index 3187e196c..f123e81d1 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -24,7 +24,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - +#if (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif #include "common.h" diff --git a/kernel/x86_64/cscal_microk_skylakex-2.c b/kernel/x86_64/cscal_microk_skylakex-2.c index 8a622427b..a6c012a4c 100644 --- a/kernel/x86_64/cscal_microk_skylakex-2.c +++ b/kernel/x86_64/cscal_microk_skylakex-2.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #include diff --git a/kernel/x86_64/csum.c b/kernel/x86_64/csum.c new file mode 100644 index 000000000..e85b5cae1 --- /dev/null +++ b/kernel/x86_64/csum.c @@ -0,0 +1,131 @@ +#include "common.h" + +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) +#include "csum_microk_skylakex-2.c" +#endif + +#ifndef HAVE_CSUM_KERNEL +static FLOAT csum_kernel(BLASLONG n, FLOAT *x) +{ + + BLASLONG i=0; + BLASLONG n_8 = n & -8; + FLOAT *x1 = x; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + FLOAT sum4 = 0.0; + + while (i < n_8) { + sum0 += x1[0]; + sum1 += x1[1]; + sum2 += x1[2]; + sum3 += x1[3]; + + sum0 += x1[4]; + sum1 += x1[5]; + sum2 += x1[6]; + sum3 += x1[7]; + + x1+=8; + i+=4; + } + + while (i < n) { + sum4 += (x1[0] + x1[1]); + x1 += 2; + i++; + } + + return sum0+sum1+sum2+sum3+sum4; +} + +#endif + +static FLOAT sum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ip = 0; + BLASLONG inc_x2; + FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return(sumf); + if (inc_x == 1) { + sumf = csum_kernel(n, x); + } + else { + inc_x2 = 2 * inc_x; + + while (i < n) { + sumf += x[ip] + x[ip + 1]; + ip += inc_x2; + i++; + } + } + + return(sumf); +} + +#if defined(SMP) +static int sum_thread_function(BLASLONG n, + BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, + FLOAT *x, BLASLONG inc_x, + FLOAT * dummy3, BLASLONG dummy4, + FLOAT * result, BLASLONG dummy5) +{ + *(FLOAT *) result = sum_compute(n, x, inc_x); + return 0; +} + +extern int blas_level1_thread_with_return_value(int mode, + BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, + void *a, BLASLONG lda, + void *b, BLASLONG ldb, + void *c, BLASLONG ldc, + int (*function)(), + int nthread); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha[2]; +#endif + FLOAT sumf = 0.0; + +#if defined(SMP) + int num_cpu = num_cpu_avail(1); + if (n <= 10000 || inc_x <= 0) + nthreads = 1; + else + nthreads = num_cpu < n/10000 ? num_cpu : n/10000; + + if (nthreads == 1) { + sumf = sum_compute(n, x, inc_x); + } + else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) *2]; + FLOAT *ptr; +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_COMPLEX; +#else + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#endif + blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, + NULL, 0, result, 0, (int (*)(void))sum_thread_function, nthreads); + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + sumf += (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2); + } + } +#else + sumf = sum_compute(n, x, inc_x); +#endif + return(sumf); +} diff --git a/kernel/x86_64/csum_microk_skylakex-2.c b/kernel/x86_64/csum_microk_skylakex-2.c new file mode 100644 index 000000000..ec882efa1 --- /dev/null +++ b/kernel/x86_64/csum_microk_skylakex-2.c @@ -0,0 +1,289 @@ +/* need a new enough GCC for avx512 support */ +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203)) + +#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203)) + +#define HAVE_CASUM_KERNEL 1 + +#include + +#include + +static FLOAT casum_kernel(BLASLONG n, FLOAT *x) +{ + FLOAT *x1 = x; + FLOAT sumf=0.0; + BLASLONG n2 = n + n; + + if (n2 < 64) { + __m128 accum_10, accum_11, accum_12, accum_13; + + accum_10 = _mm_setzero_ps(); + accum_11 = _mm_setzero_ps(); + accum_12 = _mm_setzero_ps(); + accum_13 = _mm_setzero_ps(); + + _mm_prefetch(&x1[0], _MM_HINT_T0); + + if (n2 >= 32){ + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + __m128 x01 = _mm_loadu_ps(&x1[ 4]); + __m128 x02 = _mm_loadu_ps(&x1[ 8]); + __m128 x03 = _mm_loadu_ps(&x1[12]); + + _mm_prefetch(&x1[16], _MM_HINT_T0); + __m128 x04 = _mm_loadu_ps(&x1[16]); + __m128 x05 = _mm_loadu_ps(&x1[20]); + __m128 x06 = _mm_loadu_ps(&x1[24]); + __m128 x07 = _mm_loadu_ps(&x1[28]); + + accum_10 = _mm_add_ps(accum_10, x00); + accum_11 = _mm_add_ps(accum_11, x01); + accum_12 = _mm_add_ps(accum_12, x02); + accum_13 = _mm_add_ps(accum_13, x03); + + accum_10 = _mm_add_ps(accum_10, x04); + accum_11 = _mm_add_ps(accum_11, x05); + accum_12 = _mm_add_ps(accum_12, x06); + accum_13 = _mm_add_ps(accum_13, x07); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + __m128 x01 = _mm_loadu_ps(&x1[ 4]); + __m128 x02 = _mm_loadu_ps(&x1[ 8]); + __m128 x03 = _mm_loadu_ps(&x1[12]); + + accum_10 = _mm_add_ps(accum_10, x00); + accum_11 = _mm_add_ps(accum_11, x01); + accum_12 = _mm_add_ps(accum_12, x02); + accum_13 = _mm_add_ps(accum_13, x03); + + n2 -= 16; + x1 += 16; + } + + if (n2 >= 8) { + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + __m128 x01 = _mm_loadu_ps(&x1[ 4]); + accum_10 = _mm_add_ps(accum_10, x00); + accum_11 = _mm_add_ps(accum_11, x01); + + n2 -= 8; + x1 += 8; + } + + if (n2 >= 4) { + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + accum_10 = _mm_add_ps(accum_10, x00); + + n2 -= 4; + x1 += 4; + } + + if (n2) { + sumf += (x1[0] + x1[1]); + } + + accum_10 = _mm_add_ps(accum_10, accum_11); + accum_12 = _mm_add_ps(accum_12, accum_13); + accum_10 = _mm_add_ps(accum_10, accum_12); + + accum_10 = _mm_hadd_ps(accum_10, accum_10); + accum_10 = _mm_hadd_ps(accum_10, accum_10); + + sumf += accum_10[0]; + } + else { + __m512 accum_0, accum_1, accum_2, accum_3; + __m512 x00, x01, x02, x03, x04, x05, x06, x07; + + accum_0 = _mm512_setzero_ps(); + accum_1 = _mm512_setzero_ps(); + accum_2 = _mm512_setzero_ps(); + accum_3 = _mm512_setzero_ps(); + + // alignment has side-effect when the size of input array is not large enough + if (n2 < 256) { + if (n2 >= 128) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x01 = _mm512_loadu_ps(&x1[ 16]); + x02 = _mm512_loadu_ps(&x1[ 32]); + x03 = _mm512_loadu_ps(&x1[ 48]); + x04 = _mm512_loadu_ps(&x1[ 64]); + x05 = _mm512_loadu_ps(&x1[ 80]); + x06 = _mm512_loadu_ps(&x1[ 96]); + x07 = _mm512_loadu_ps(&x1[112]); + + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + accum_0 = _mm512_add_ps(accum_0, x04); + accum_1 = _mm512_add_ps(accum_1, x05); + accum_2 = _mm512_add_ps(accum_2, x06); + accum_3 = _mm512_add_ps(accum_3, x07); + + n2 -= 128; + x1 += 128; + } + + if (n2 >= 64) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x01 = _mm512_loadu_ps(&x1[16]); + x02 = _mm512_loadu_ps(&x1[32]); + x03 = _mm512_loadu_ps(&x1[48]); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + n2 -= 64; + x1 += 64; + } + + if (n2 >= 32) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x01 = _mm512_loadu_ps(&x1[16]); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_loadu_ps(&x1[ 0]); + accum_0 = _mm512_add_ps(accum_0, x00); + + n2 -= 16; + x1 += 16; + } + + if (n2) { + uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16 - n2)); + x00 = _mm512_maskz_loadu_ps(*((__mmask16*) &tail_mask16), &x1[ 0]); + accum_0 = _mm512_add_ps(accum_0, x00); + } + accum_0 = _mm512_add_ps(accum_0, accum_1); + accum_2 = _mm512_add_ps(accum_2, accum_3); + accum_0 = _mm512_add_ps(accum_0, accum_2); + + sumf = _mm512_reduce_add_ps(accum_0); + } + // n2 >= 256, doing alignment + else { + + int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 2) & 0xf; + + if (0 != align_header) { + uint16_t align_mask16 = (((uint16_t)0xffff) >> (16 - align_header)); + x00 = _mm512_maskz_loadu_ps(*((__mmask16*) &align_mask16), &x1[0]); + accum_0 = _mm512_add_ps(accum_0, x00); + + n2 -= align_header; + x1 += align_header; + } + + x00 = _mm512_load_ps(&x1[ 0]); + x01 = _mm512_load_ps(&x1[ 16]); + x02 = _mm512_load_ps(&x1[ 32]); + x03 = _mm512_load_ps(&x1[ 48]); + x04 = _mm512_load_ps(&x1[ 64]); + x05 = _mm512_load_ps(&x1[ 80]); + x06 = _mm512_load_ps(&x1[ 96]); + x07 = _mm512_load_ps(&x1[112]); + + n2 -= 128; + x1 += 128; + + while (n2 >= 128) { + + accum_0 = _mm512_add_ps(accum_0, x00); + x00 = _mm512_load_ps(&x1[ 0]); + accum_1 = _mm512_add_ps(accum_1, x01); + x01 = _mm512_load_ps(&x1[ 16]); + accum_2 = _mm512_add_ps(accum_2, x02); + x02 = _mm512_load_ps(&x1[ 32]); + accum_3 = _mm512_add_ps(accum_3, x03); + x03 = _mm512_load_ps(&x1[ 48]); + + accum_0 = _mm512_add_ps(accum_0, x04); + x04 = _mm512_load_ps(&x1[ 64]); + accum_1 = _mm512_add_ps(accum_1, x05); + x05 = _mm512_load_ps(&x1[ 80]); + accum_2 = _mm512_add_ps(accum_2, x06); + x06 = _mm512_load_ps(&x1[ 96]); + accum_3 = _mm512_add_ps(accum_3, x07); + x07 = _mm512_load_ps(&x1[112]); + + n2 -= 128; + x1 += 128; + } + + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + accum_0 = _mm512_add_ps(accum_0, x04); + accum_1 = _mm512_add_ps(accum_1, x05); + accum_2 = _mm512_add_ps(accum_2, x06); + accum_3 = _mm512_add_ps(accum_3, x07); + + if (n2 >= 64) { + x00 = _mm512_load_ps(&x1[ 0]); + x01 = _mm512_load_ps(&x1[16]); + x02 = _mm512_load_ps(&x1[32]); + x03 = _mm512_load_ps(&x1[48]); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + n2 -= 64; + x1 += 64; + } + + if (n2 >= 32) { + x00 = _mm512_load_ps(&x1[ 0]); + x01 = _mm512_load_ps(&x1[16]); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_load_ps(&x1[ 0]); + accum_0 = _mm512_add_ps(accum_0, x00); + + n2 -= 16; + x1 += 16; + } + + if (n2) { + uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16 - n2)); + x00 = _mm512_maskz_load_ps(*((__mmask16*) &tail_mask16), &x1[ 0]); + accum_0 = _mm512_add_ps(accum_0, x00); + } + + accum_0 = _mm512_add_ps(accum_0, accum_1); + accum_2 = _mm512_add_ps(accum_2, accum_3); + accum_0 = _mm512_add_ps(accum_0, accum_2); + sumf = _mm512_reduce_add_ps(accum_0); + } + } + + return sumf; +} +#endif +#endif diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c index a9c40f38f..0147c6978 100644 --- a/kernel/x86_64/dasum.c +++ b/kernel/x86_64/dasum.c @@ -4,7 +4,7 @@ #define ABS_K(a) ((a) > 0 ? (a) : (-(a))) #endif -#if defined(SKYLAKEX) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #include "dasum_microk_skylakex-2.c" #elif defined(HASWELL) || defined(ZEN) #include "dasum_microk_haswell-2.c" diff --git a/kernel/x86_64/dasum_microk_haswell-2.c b/kernel/x86_64/dasum_microk_haswell-2.c index fd9da7ebe..bc27c7647 100644 --- a/kernel/x86_64/dasum_microk_haswell-2.c +++ b/kernel/x86_64/dasum_microk_haswell-2.c @@ -1,4 +1,7 @@ -#if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_DASUM_KERNEL diff --git a/kernel/x86_64/dasum_microk_skylakex-2.c b/kernel/x86_64/dasum_microk_skylakex-2.c index 83bc078b3..76b9fbef0 100644 --- a/kernel/x86_64/dasum_microk_skylakex-2.c +++ b/kernel/x86_64/dasum_microk_skylakex-2.c @@ -1,5 +1,8 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_DASUM_KERNEL 1 diff --git a/kernel/x86_64/daxpy_microk_skylakex-2.c b/kernel/x86_64/daxpy_microk_skylakex-2.c index e785a39f1..5b9147d10 100644 --- a/kernel/x86_64/daxpy_microk_skylakex-2.c +++ b/kernel/x86_64/daxpy_microk_skylakex-2.c @@ -27,7 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #include diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index f3b9ee701..569ed2416 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -159,7 +159,7 @@ static int dot_thread_function(BLASLONG n, BLASLONG dummy0, extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, - void *c, BLASLONG ldc, int (*function)(), int nthreads); + void *c, BLASLONG ldc, int (*function)(void), int nthreads); #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/x86_64/ddot_microk_skylakex-2.c b/kernel/x86_64/ddot_microk_skylakex-2.c index 8eabf225a..f076862f7 100644 --- a/kernel/x86_64/ddot_microk_skylakex-2.c +++ b/kernel/x86_64/ddot_microk_skylakex-2.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_KERNEL_8 1 diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c index a98772b94..da57a18a7 100644 --- a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c @@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #include #include "common.h" diff --git a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c index 37d1ca497..69ad6d94e 100644 --- a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c @@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #include #include "common.h" diff --git a/kernel/x86_64/dgemv_n_microk_skylakex-4.c b/kernel/x86_64/dgemv_n_microk_skylakex-4.c index 4030399ab..4e8739864 100644 --- a/kernel/x86_64/dgemv_n_microk_skylakex-4.c +++ b/kernel/x86_64/dgemv_n_microk_skylakex-4.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_KERNEL_4x4 1 diff --git a/kernel/x86_64/drot.c b/kernel/x86_64/drot.c index 40c9cf19d..6fdf4ae56 100644 --- a/kernel/x86_64/drot.c +++ b/kernel/x86_64/drot.c @@ -169,7 +169,7 @@ static int rot_thread_function(blas_arg_t *args) return 0; } -extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); +extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(void), int nthreads); #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) { diff --git a/kernel/x86_64/drot_microk_skylakex-2.c b/kernel/x86_64/drot_microk_skylakex-2.c index 4e862e663..bf9c044d4 100644 --- a/kernel/x86_64/drot_microk_skylakex-2.c +++ b/kernel/x86_64/drot_microk_skylakex-2.c @@ -1,5 +1,8 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_DROT_KERNEL 1 diff --git a/kernel/x86_64/dscal_microk_skylakex-2.c b/kernel/x86_64/dscal_microk_skylakex-2.c index e0598272e..381136414 100644 --- a/kernel/x86_64/dscal_microk_skylakex-2.c +++ b/kernel/x86_64/dscal_microk_skylakex-2.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #include diff --git a/kernel/x86_64/dsymv_L_microk_skylakex-2.c b/kernel/x86_64/dsymv_L_microk_skylakex-2.c index f0df5aaa8..ca4773a4b 100644 --- a/kernel/x86_64/dsymv_L_microk_skylakex-2.c +++ b/kernel/x86_64/dsymv_L_microk_skylakex-2.c @@ -27,7 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #include diff --git a/kernel/x86_64/dtobf16_microk_cooperlake.c b/kernel/x86_64/dtobf16_microk_cooperlake.c index 9b8ac4714..b713b39be 100644 --- a/kernel/x86_64/dtobf16_microk_cooperlake.c +++ b/kernel/x86_64/dtobf16_microk_cooperlake.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_TOBF16_ACCL_KERNEL 1 #include "common.h" diff --git a/kernel/x86_64/nrm2.S b/kernel/x86_64/nrm2.S index b79ac2adb..61cf8c452 100644 --- a/kernel/x86_64/nrm2.S +++ b/kernel/x86_64/nrm2.S @@ -58,7 +58,7 @@ testq M, M jle .L999 testq INCX, INCX - jle .L999 + je .L999 salq $BASE_SHIFT, INCX diff --git a/kernel/x86_64/nrm2_sse.S b/kernel/x86_64/nrm2_sse.S index 33b1ee496..c1f3a45fc 100644 --- a/kernel/x86_64/nrm2_sse.S +++ b/kernel/x86_64/nrm2_sse.S @@ -57,7 +57,7 @@ jle .L999 pxor %xmm1, %xmm1 testq INCX, INCX - jle .L999 + je .L999 pxor %xmm2, %xmm2 leaq (, INCX, SIZE), INCX diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c index 37a92468f..3f22cb97a 100644 --- a/kernel/x86_64/sasum.c +++ b/kernel/x86_64/sasum.c @@ -9,7 +9,7 @@ #endif -#if defined(SKYLAKEX) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #include "sasum_microk_skylakex-2.c" #elif defined(HASWELL) || defined(ZEN) #include "sasum_microk_haswell-2.c" diff --git a/kernel/x86_64/sasum_microk_haswell-2.c b/kernel/x86_64/sasum_microk_haswell-2.c index 2eb5b9538..3b4d65cfc 100644 --- a/kernel/x86_64/sasum_microk_haswell-2.c +++ b/kernel/x86_64/sasum_microk_haswell-2.c @@ -1,4 +1,7 @@ -#if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_SASUM_KERNEL 1 diff --git a/kernel/x86_64/sasum_microk_skylakex-2.c b/kernel/x86_64/sasum_microk_skylakex-2.c index fbc91b558..f193053ee 100644 --- a/kernel/x86_64/sasum_microk_skylakex-2.c +++ b/kernel/x86_64/sasum_microk_skylakex-2.c @@ -1,5 +1,8 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_SASUM_KERNEL 1 diff --git a/kernel/x86_64/saxpy_microk_skylakex-2.c b/kernel/x86_64/saxpy_microk_skylakex-2.c index 950f10ba2..bbe4d2bc5 100644 --- a/kernel/x86_64/saxpy_microk_skylakex-2.c +++ b/kernel/x86_64/saxpy_microk_skylakex-2.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_KERNEL_16 1 diff --git a/kernel/x86_64/sbdot_microk_cooperlake.c b/kernel/x86_64/sbdot_microk_cooperlake.c index 2aefe46ff..ccec98e34 100644 --- a/kernel/x86_64/sbdot_microk_cooperlake.c +++ b/kernel/x86_64/sbdot_microk_cooperlake.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_SBDOT_ACCL_KERNEL 1 #include "common.h" diff --git a/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c index 90e0a32c7..5ee3c8532 100644 --- a/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c +++ b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c @@ -97,33 +97,32 @@ typedef struct { #define T_C10 6 #define T_C11 7 -// FIXME: gcc11 seem have problem in tile load/store address calc, -// need to multiply with element size (2 or 4) here. + #define LOAD_A(M, N) _tile_loadd(T_A##M, ptr_a##M, lda * 2) #define LOAD_A_TAIL(M, N) {\ __m256i ymm = _mm256_loadu_epi16(ptr_a##M); \ __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ - _tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \ + _tile_loadd(T_A##M, tail_a + 16 * M, 2 * 2); \ } #define MASK_LOAD_A_TAIL(M, N) {\ __m256i ymm = _mm256_maskz_loadu_epi16(amask, ptr_a##M); \ __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ - _tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \ + _tile_loadd(T_A##M, tail_a + 16 * M, 2 * 2); \ } #define LOAD_B(M, N) _tile_loadd(T_B##N, ptr_b##N, ldb * 2) #define LOAD_B_TAIL(M, N) {\ __m256i ymm = _mm256_loadu_epi16(ptr_b##N); \ __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ - _tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \ + _tile_loadd(T_B##N, tail_b + 16 * N, 2 * 2); \ } #define MASK_LOAD_B_TAIL(M, N) {\ __m256i ymm = _mm256_maskz_loadu_epi16(bmask, ptr_b##N); \ __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ - _tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \ + _tile_loadd(T_B##N, tail_b + 16 * N, 2 * 2); \ } #define MATMUL(M, N) _tile_dpbf16ps(T_C##M##N, T_A##M, T_B##N) diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake.c b/kernel/x86_64/sbgemv_n_microk_cooperlake.c index d875e0d96..c87f9fa5b 100644 --- a/kernel/x86_64/sbgemv_n_microk_cooperlake.c +++ b/kernel/x86_64/sbgemv_n_microk_cooperlake.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_SBGEMV_N_ACCL_KERNEL 1 #include "common.h" diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake.c b/kernel/x86_64/sbgemv_t_microk_cooperlake.c index 23da2e809..5b7a2e147 100644 --- a/kernel/x86_64/sbgemv_t_microk_cooperlake.c +++ b/kernel/x86_64/sbgemv_t_microk_cooperlake.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_SBGEMV_T_ACCL_KERNEL 1 diff --git a/kernel/x86_64/sdot_microk_skylakex-2.c b/kernel/x86_64/sdot_microk_skylakex-2.c index 1fcb7f27c..f14632f94 100644 --- a/kernel/x86_64/sdot_microk_skylakex-2.c +++ b/kernel/x86_64/sdot_microk_skylakex-2.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_KERNEL_16 1 diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index 2366fe3aa..6f4309c30 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -24,7 +24,11 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) + #include #include "common.h" diff --git a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c index 308f5e35e..987b090ba 100644 --- a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c @@ -24,7 +24,11 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) + #include #include "common.h" diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index c9681fa8b..296eded5a 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif diff --git a/kernel/x86_64/sgemv_n_microk_skylakex-8.c b/kernel/x86_64/sgemv_n_microk_skylakex-8.c index fba9cedcd..199621712 100644 --- a/kernel/x86_64/sgemv_n_microk_skylakex-8.c +++ b/kernel/x86_64/sgemv_n_microk_skylakex-8.c @@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ >= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && __NVCOMPVERS >= 2203 ) + #define HAVE_SGEMV_N_SKYLAKE_KERNEL 1 #include "common.h" @@ -255,4 +259,4 @@ static int sgemv_kernel_n_64(BLASLONG m, BLASLONG n, float alpha, float *a, BLAS } -#endif \ No newline at end of file +#endif diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 07aa51503..ea89a2aaf 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif diff --git a/kernel/x86_64/sgemv_t_microk_skylakex.c b/kernel/x86_64/sgemv_t_microk_skylakex.c index dca12acfc..d4f675a1e 100644 --- a/kernel/x86_64/sgemv_t_microk_skylakex.c +++ b/kernel/x86_64/sgemv_t_microk_skylakex.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ >= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ >= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_SGEMV_T_SKYLAKE_KERNEL 1 #include "common.h" diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c index a49544616..05724b427 100644 --- a/kernel/x86_64/srot.c +++ b/kernel/x86_64/srot.c @@ -171,7 +171,7 @@ static int rot_thread_function(blas_arg_t *args) return 0; } -extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); +extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(void), int nthreads); #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) { diff --git a/kernel/x86_64/srot_microk_skylakex-2.c b/kernel/x86_64/srot_microk_skylakex-2.c index a21d1cf64..aec25ac56 100644 --- a/kernel/x86_64/srot_microk_skylakex-2.c +++ b/kernel/x86_64/srot_microk_skylakex-2.c @@ -1,5 +1,8 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_SROT_KERNEL 1 diff --git a/kernel/x86_64/sscal_microk_skylakex-2.c b/kernel/x86_64/sscal_microk_skylakex-2.c index c4fa160f0..5c13cba55 100644 --- a/kernel/x86_64/sscal_microk_skylakex-2.c +++ b/kernel/x86_64/sscal_microk_skylakex-2.c @@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) + #include diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index 45914daf5..4826b00c6 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 26e5ca7e9..06db14ebe 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif diff --git a/kernel/x86_64/stobf16_microk_cooperlake.c b/kernel/x86_64/stobf16_microk_cooperlake.c index 2756a6934..e7d20ddfa 100644 --- a/kernel/x86_64/stobf16_microk_cooperlake.c +++ b/kernel/x86_64/stobf16_microk_cooperlake.c @@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) + #define HAVE_TOBF16_ACCL_KERNEL 1 #include "common.h" diff --git a/kernel/x86_64/zasum.c b/kernel/x86_64/zasum.c index 80e95a2c8..3f17ab1cf 100644 --- a/kernel/x86_64/zasum.c +++ b/kernel/x86_64/zasum.c @@ -4,7 +4,7 @@ #define ABS_K(a) ((a) > 0 ? (a) : (-(a))) #endif -#if defined(SKYLAKEX) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #include "zasum_microk_skylakex-2.c" #endif diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c index e257a5456..f6bc8e37b 100644 --- a/kernel/x86_64/zasum_microk_skylakex-2.c +++ b/kernel/x86_64/zasum_microk_skylakex-2.c @@ -1,5 +1,10 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203)) + +#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203)) #define HAVE_ZASUM_KERNEL 1 @@ -16,16 +21,14 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x) if (n2 < 32) { __m128d accum_10, accum_11, accum_12, accum_13; - __m128d abs_mask1 = abs_mask1; + __m128d abs_mask1; accum_10 = _mm_setzero_pd(); accum_11 = _mm_setzero_pd(); accum_12 = _mm_setzero_pd(); accum_13 = _mm_setzero_pd(); - // abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff); - abs_mask1 = (__m128d)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1); - abs_mask1 = (__m128d)_mm_srli_epi64((__m128i) abs_mask1, 1); + abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff); _mm_prefetch(&x1[0], _MM_HINT_T0); if (n2 >= 16){ @@ -338,3 +341,4 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x) return sumf; } #endif +#endif diff --git a/kernel/x86_64/zaxpy_sse2.S b/kernel/x86_64/zaxpy_sse2.S index a7dd054fb..3776c8910 100644 --- a/kernel/x86_64/zaxpy_sse2.S +++ b/kernel/x86_64/zaxpy_sse2.S @@ -1418,10 +1418,10 @@ movq M, %rax //If incx==0 || incy==0, avoid unloop and jump to end. cmpq $0, INCX - je .L58 + jne .L59 cmpq $0, INCY je .L58 - +.L59: sarq $3, %rax jle .L55 diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 27397ccfa..51efa2dfe 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif @@ -92,7 +92,7 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) #if defined(SMP) extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, - void *c, BLASLONG ldc, int (*function)(), int nthreads); + void *c, BLASLONG ldc, int (*function)(void), int nthreads); #endif diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index 8fc960610..678cea957 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 63c8b11a4..44d545df7 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif diff --git a/kernel/x86_64/znrm2.S b/kernel/x86_64/znrm2.S index 0d2aa3480..748fde310 100644 --- a/kernel/x86_64/znrm2.S +++ b/kernel/x86_64/znrm2.S @@ -58,7 +58,7 @@ testq M, M jle .L999 testq INCX, INCX - jle .L999 + je .L999 salq $ZBASE_SHIFT, INCX diff --git a/kernel/x86_64/znrm2_sse.S b/kernel/x86_64/znrm2_sse.S index f78b83f7e..2274f2e98 100644 --- a/kernel/x86_64/znrm2_sse.S +++ b/kernel/x86_64/znrm2_sse.S @@ -58,7 +58,7 @@ jle .L999 pxor %xmm1, %xmm1 testq INCX, INCX - jle .L999 + je .L999 xorq FLAG, FLAG diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 45e3531b8..bc79c0caf 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #include "common.h" - +#include #if defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zscal_microk_skylakex-2.c" @@ -69,16 +69,16 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha , FLOAT *x ) for( i=0; i FLT_MAX) { + while(j < n) + { + x[i]= NAN; + x[i+1] = da_r; + i += 2 ; + j++; + + } + + } else { while(j < n) { - temp0 = -da_i * x[i+1]; + if (x[i] < -FLT_MAX || x[i] > FLT_MAX) + temp0 = NAN; x[i+1] = da_i * x[i]; - x[i] = temp0; + if ( x[i] == x[i]) //preserve NaN + x[i] = temp0; i += 2 ; j++; @@ -409,28 +425,24 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } else { - - if ( da_i == 0.0 ) + if (da_i == 0.0) { + while(j < n) + { - while(j < n) - { - temp0 = da_r * x[i]; x[i+1] = da_r * x[i+1]; x[i] = temp0; i += 2 ; j++; - } - + } } else { while(j < n) { - temp0 = da_r * x[i] - da_i * x[i+1]; x[i+1] = da_r * x[i+1] + da_i * x[i]; x[i] = temp0; @@ -439,11 +451,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } - } + } } return(0); } - - diff --git a/kernel/x86_64/zscal_microk_skylakex-2.c b/kernel/x86_64/zscal_microk_skylakex-2.c index f9e05e333..29dc4f6df 100644 --- a/kernel/x86_64/zscal_microk_skylakex-2.c +++ b/kernel/x86_64/zscal_microk_skylakex-2.c @@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) + #include diff --git a/kernel/x86_64/zscal_sse2.S b/kernel/x86_64/zscal_sse2.S index 223b1e439..d6a49136d 100644 --- a/kernel/x86_64/zscal_sse2.S +++ b/kernel/x86_64/zscal_sse2.S @@ -82,6 +82,7 @@ pxor %xmm15, %xmm15 comisd %xmm0, %xmm15 jne .L100 + jp .L100 comisd %xmm1, %xmm15 jne .L100 diff --git a/kernel/x86_64/zsum.c b/kernel/x86_64/zsum.c new file mode 100644 index 000000000..5973c1253 --- /dev/null +++ b/kernel/x86_64/zsum.c @@ -0,0 +1,131 @@ +#include "common.h" + +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) +#include "zsum_microk_skylakex-2.c" +#endif + +#ifndef HAVE_ZASUM_KERNEL +static FLOAT zasum_kernel(BLASLONG n, FLOAT *x) +{ + + BLASLONG i=0; + BLASLONG n_8 = n & -8; + FLOAT *x1 = x; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + FLOAT sum4 = 0.0; + + while (i < n_8) { + sum0 += x1[0]; + sum1 += x1[1]; + sum2 += x1[2]; + sum3 += x1[3]; + + sum0 += x1[4]; + sum1 += x1[5]; + sum2 += x1[6]; + sum3 += x1[7]; + + x1+=8; + i+=4; + } + + while (i < n) { + sum4 += x1[0] + x1[1]; + x1 += 2; + i++; + } + + return sum0+sum1+sum2+sum3+sum4; +} + +#endif + +static FLOAT sum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ip = 0; + BLASLONG inc_x2; + FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return(sumf); + if (inc_x == 1) { + sumf = zsum_kernel(n, x); + } + else { + inc_x2 = 2 * inc_x; + + while (i < n) { + sumf += x[ip] + x[ip + 1]; + ip += inc_x2; + i++; + } + } + + return(sumf); +} + +#if defined(SMP) +static int sum_thread_function(BLASLONG n, + BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, + FLOAT *x, BLASLONG inc_x, + FLOAT * dummy3, BLASLONG dummy4, + FLOAT * result, BLASLONG dummy5) +{ + *(FLOAT *) result = sum_compute(n, x, inc_x); + return 0; +} + +extern int blas_level1_thread_with_return_value(int mode, + BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, + void *a, BLASLONG lda, + void *b, BLASLONG ldb, + void *c, BLASLONG ldc, + int (*function)(), + int nthread); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha[2]; +#endif + FLOAT sumf = 0.0; + +#if defined(SMP) + int num_cpu = num_cpu_avail(1); + if (n <= 10000 || inc_x <= 0) + nthreads = 1; + else + nthreads = num_cpu < n/10000 ? num_cpu : n/10000; + + if (nthreads == 1) { + sumf = sum_compute(n, x, inc_x); + } + else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) *2]; + FLOAT *ptr; +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_COMPLEX; +#else + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#endif + blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, + NULL, 0, result, 0, (int (*)(void))sum_thread_function, nthreads); + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + sumf += (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2); + } + } +#else + sumf = sum_compute(n, x, inc_x); +#endif + return(sumf); +} diff --git a/kernel/x86_64/zsum_microk_skylakex-2.c b/kernel/x86_64/zsum_microk_skylakex-2.c new file mode 100644 index 000000000..0bca7ce6d --- /dev/null +++ b/kernel/x86_64/zsum_microk_skylakex-2.c @@ -0,0 +1,280 @@ +/* need a new enough GCC for avx512 support */ +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203)) + +#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203)) + +#define HAVE_ZSUM_KERNEL 1 + +#include + +#include + +static FLOAT zsum_kernel(BLASLONG n, FLOAT *x) +{ + FLOAT *x1 = x; + FLOAT sumf=0.0; + BLASLONG n2 = n + n; + + + if (n2 < 32) { + __m128d accum_10, accum_11, accum_12, accum_13; + + accum_10 = _mm_setzero_pd(); + accum_11 = _mm_setzero_pd(); + accum_12 = _mm_setzero_pd(); + accum_13 = _mm_setzero_pd(); + + _mm_prefetch(&x1[0], _MM_HINT_T0); + if (n2 >= 16){ + __m128d x00 = _mm_loadu_pd(&x1[ 0]); + __m128d x01 = _mm_loadu_pd(&x1[ 2]); + __m128d x02 = _mm_loadu_pd(&x1[ 4]); + __m128d x03 = _mm_loadu_pd(&x1[ 6]); + + _mm_prefetch(&x1[8], _MM_HINT_T0); + __m128d x04 = _mm_loadu_pd(&x1[ 8]); + __m128d x05 = _mm_loadu_pd(&x1[10]); + __m128d x06 = _mm_loadu_pd(&x1[12]); + __m128d x07 = _mm_loadu_pd(&x1[14]); + + accum_10 = _mm_add_pd(accum_10, x00); + accum_11 = _mm_add_pd(accum_11, x01); + accum_12 = _mm_add_pd(accum_12, x02); + accum_13 = _mm_add_pd(accum_13, x03); + + accum_10 = _mm_add_pd(accum_10, x04); + accum_11 = _mm_add_pd(accum_11, x05); + accum_12 = _mm_add_pd(accum_12, x06); + accum_13 = _mm_add_pd(accum_13, x07); + + x1 += 16; + n2 -= 16; + } + + if (n2 >= 8) { + __m128d x00 = _mm_loadu_pd(&x1[ 0]); + __m128d x01 = _mm_loadu_pd(&x1[ 2]); + __m128d x02 = _mm_loadu_pd(&x1[ 4]); + __m128d x03 = _mm_loadu_pd(&x1[ 6]); + + accum_10 = _mm_add_pd(accum_10, x00); + accum_11 = _mm_add_pd(accum_11, x01); + accum_12 = _mm_add_pd(accum_12, x02); + accum_13 = _mm_add_pd(accum_13, x03); + + n2 -= 8; + x1 += 8; + } + + if (n2 >= 4) { + __m128d x00 = _mm_loadu_pd(&x1[ 0]); + __m128d x01 = _mm_loadu_pd(&x1[ 2]); + accum_10 = _mm_add_pd(accum_10, x00); + accum_11 = _mm_add_pd(accum_11, x01); + + n2 -= 4; + x1 += 4; + } + + if (n2) { + __m128d x00 = _mm_loadu_pd(&x1[ 0]); + accum_10 = _mm_add_pd(accum_10, x00); + } + + accum_10 = _mm_add_pd(accum_10, accum_11); + accum_12 = _mm_add_pd(accum_12, accum_13); + accum_10 = _mm_add_pd(accum_10, accum_12); + + accum_10 = _mm_hadd_pd(accum_10, accum_10); + + sumf = accum_10[0]; + } + else { + __m512d accum_0, accum_1, accum_2, accum_3; + __m512d x00, x01, x02, x03, x04, x05, x06, x07; + __m512d abs_mask = (__m512d)_mm512_set1_epi64(0x7fffffffffffffff); + + accum_0 = _mm512_setzero_pd(); + accum_1 = _mm512_setzero_pd(); + accum_2 = _mm512_setzero_pd(); + accum_3 = _mm512_setzero_pd(); + + // alignment has side-effect when the size of input array is not large enough + if (n2 < 128) { + if (n2 >= 64) { + x00 = _mm512_loadu_pd(&x1[ 0]); + x01 = _mm512_loadu_pd(&x1[ 8]); + x02 = _mm512_loadu_pd(&x1[16]); + x03 = _mm512_loadu_pd(&x1[24]); + x04 = _mm512_loadu_pd(&x1[32]); + x05 = _mm512_loadu_pd(&x1[40]); + x06 = _mm512_loadu_pd(&x1[48]); + x07 = _mm512_loadu_pd(&x1[56]); + + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + accum_2 = _mm512_add_pd(accum_2, x02); + accum_3 = _mm512_add_pd(accum_3, x03); + + accum_0 = _mm512_add_pd(accum_0, x04); + accum_1 = _mm512_add_pd(accum_1, x05); + accum_2 = _mm512_add_pd(accum_2, x06); + accum_3 = _mm512_add_pd(accum_3, x07); + + n2 -= 64; + x1 += 64; + } + + if (n2 >= 32) { + x00 = _mm512_loadu_pd(&x1[ 0]); + x01 = _mm512_loadu_pd(&x1[ 8]); + x02 = _mm512_loadu_pd(&x1[16]); + x03 = _mm512_loadu_pd(&x1[24]); + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + accum_2 = _mm512_add_pd(accum_2, x02); + accum_3 = _mm512_add_pd(accum_3, x03); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_loadu_pd(&x1[ 0]); + x01 = _mm512_loadu_pd(&x1[ 8]); + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + + n2 -= 16; + x1 += 16; + } + + if (n2 >= 8) { + x00 = _mm512_loadu_pd(&x1[ 0]); + accum_0 = _mm512_add_pd(accum_0, x00); + + n2 -= 8; + x1 += 8; + } + + if (n2) { + unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2)); + x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &x1[ 0]); + accum_0 = _mm512_add_pd(accum_0, x00); + } + accum_0 = _mm512_add_pd(accum_0, accum_1); + accum_2 = _mm512_add_pd(accum_2, accum_3); + accum_0 = _mm512_add_pd(accum_0, accum_2); + sumf = _mm512_reduce_add_pd(accum_0); + } + // n2 >= 128, doing alignment + else { + + int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 3) & 0x7; + + if (0 != align_header) { + unsigned char align_mask8 = (((unsigned char)0xff) >> (8 - align_header)); + x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &align_mask8), &x1[0]); + accum_0 = _mm512_add_pd(accum_0, x00); + + n2 -= align_header; + x1 += align_header; + } + + x00 = _mm512_load_pd(&x1[ 0]); + x01 = _mm512_load_pd(&x1[ 8]); + x02 = _mm512_load_pd(&x1[16]); + x03 = _mm512_load_pd(&x1[24]); + x04 = _mm512_load_pd(&x1[32]); + x05 = _mm512_load_pd(&x1[40]); + x06 = _mm512_load_pd(&x1[48]); + x07 = _mm512_load_pd(&x1[56]); + + n2 -= 64; + x1 += 64; + + while (n2 >= 64) { + accum_0 = _mm512_add_pd(accum_0, x00); + x00 = _mm512_load_pd(&x1[ 0]); + accum_1 = _mm512_add_pd(accum_1, x01); + x01 = _mm512_load_pd(&x1[ 8]); + accum_2 = _mm512_add_pd(accum_2, x02); + x02 = _mm512_load_pd(&x1[16]); + accum_3 = _mm512_add_pd(accum_3, x03); + x03 = _mm512_load_pd(&x1[24]); + + accum_0 = _mm512_add_pd(accum_0, x04); + x04 = _mm512_load_pd(&x1[32]); + accum_1 = _mm512_add_pd(accum_1, x05); + x05 = _mm512_load_pd(&x1[40]); + accum_2 = _mm512_add_pd(accum_2, x06); + x06 = _mm512_load_pd(&x1[48]); + accum_3 = _mm512_add_pd(accum_3, x07); + x07 = _mm512_load_pd(&x1[56]); + + n2 -= 64; + x1 += 64; + } + + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + accum_2 = _mm512_add_pd(accum_2, x02); + accum_3 = _mm512_add_pd(accum_3, x03); + + accum_0 = _mm512_add_pd(accum_0, x04); + accum_1 = _mm512_add_pd(accum_1, x05); + accum_2 = _mm512_add_pd(accum_2, x06); + accum_3 = _mm512_add_pd(accum_3, x07); + + if (n2 >= 32) { + x00 = _mm512_load_pd(&x1[ 0]); + x01 = _mm512_load_pd(&x1[ 8]); + x02 = _mm512_load_pd(&x1[16]); + x03 = _mm512_load_pd(&x1[24]); + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + accum_2 = _mm512_add_pd(accum_2, x02); + accum_3 = _mm512_add_pd(accum_3, x03); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_load_pd(&x1[ 0]); + x01 = _mm512_load_pd(&x1[ 8]); + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + + n2 -= 16; + x1 += 16; + } + + if (n2 >= 8) { + x00 = _mm512_load_pd(&x1[ 0]); + accum_0 = _mm512_add_pd(accum_0, x00); + + n2 -= 8; + x1 += 8; + } + + if (n2) { + unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2)); + x00 = _mm512_maskz_load_pd(*((__mmask8*) &tail_mask8), &x1[ 0]); + accum_0 = _mm512_add_pd(accum_0, x00); + } + + accum_0 = _mm512_add_pd(accum_0, accum_1); + accum_2 = _mm512_add_pd(accum_2, accum_3); + accum_0 = _mm512_add_pd(accum_0, accum_2); + sumf = _mm512_reduce_add_pd(accum_0); + } + } + + return sumf; +} +#endif +#endif diff --git a/kernel/x86_64/zsum_sse.S b/kernel/x86_64/zsum_sse.S new file mode 100644 index 000000000..b679b42b0 --- /dev/null +++ b/kernel/x86_64/zsum_sse.S @@ -0,0 +1,299 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + pxor %xmm0, %xmm0 + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + salq $ZBASE_SHIFT, INCX + + cmpq $2 * SIZE, INCX + jne .L100 + + subq $-32 * SIZE, X + addq M, M + + cmpq $3, M + jle .L18 + + testq $4, X + je .L05 + movss -32 * SIZE(X), %xmm0 + addq $SIZE, X + decq M + jle .L998 + ALIGN_3 + +.L05: + testq $8, X + je .L10 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(X), %xmm1 + addq $2 * SIZE, X + subq $2, M + jle .L998 + ALIGN_3 + +.L10: + movq M, I + sarq $5, I + jle .L14 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + movaps -16 * SIZE(X), %xmm8 + movaps -12 * SIZE(X), %xmm9 + movaps -8 * SIZE(X), %xmm10 + movaps -4 * SIZE(X), %xmm11 + decq I + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + addps %xmm5, %xmm1 + movaps 4 * SIZE(X), %xmm5 + + addps %xmm6, %xmm2 + movaps 8 * SIZE(X), %xmm6 + + addps %xmm7, %xmm3 + movaps 12 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + addps %xmm8, %xmm0 + movaps 16 * SIZE(X), %xmm8 + + addps %xmm9, %xmm1 + movaps 20 * SIZE(X), %xmm9 + + addps %xmm10, %xmm2 + movaps 24 * SIZE(X), %xmm10 + + addps %xmm11, %xmm3 + movaps 28 * SIZE(X), %xmm11 + + subq $-32 * SIZE, X + decq I + jg .L11 + ALIGN_3 + +.L12: + addps %xmm4, %xmm0 + addps %xmm5, %xmm1 + + addps %xmm6, %xmm2 + addps %xmm7, %xmm3 + + addps %xmm8, %xmm0 + addps %xmm9, %xmm1 + + addps %xmm10, %xmm2 + addps %xmm11, %xmm3 + + addq $32 * SIZE, X + ALIGN_3 + +.L14: + testq $31, M + jle .L998 + +.L15: + testq $16, M + je .L16 + + movaps -32 * SIZE(X), %xmm4 + addps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 + addps %xmm5, %xmm1 + + movaps -24 * SIZE(X), %xmm4 + addps %xmm4, %xmm0 + + movaps -20 * SIZE(X), %xmm5 + addps %xmm5, %xmm1 + + addq $16 * SIZE, X + ALIGN_3 + +.L16: + testq $8, M + je .L17 + + movaps -32 * SIZE(X), %xmm4 + addps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 + addps %xmm5, %xmm1 + + addq $8 * SIZE, X + ALIGN_3 + +.L17: + testq $4, M + je .L18 + + movaps -32 * SIZE(X), %xmm6 + addps %xmm6, %xmm2 + addq $4 * SIZE, X + ALIGN_3 + +.L18: + testq $2, M + je .L19 + +#ifdef movsd + xorps %xmm7, %xmm7 +#endif + movsd -32 * SIZE(X), %xmm7 + addps %xmm7, %xmm3 + addq $2 * SIZE, X + ALIGN_3 + +.L19: + testq $1, M + je .L998 + + movss -32 * SIZE(X), %xmm6 + addps %xmm6, %xmm2 + jmp .L998 + ALIGN_4 + +.L100: + movq M, I + sarq $2, I + jle .L105 + ALIGN_4 + +.L101: + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X + + addps %xmm4, %xmm0 + + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X + + addps %xmm5, %xmm1 + + decq I + jg .L101 + ALIGN_4 + +.L105: +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + andq $3, M + jle .L998 + ALIGN_4 + +.L106: + movsd (X), %xmm4 + addps %xmm4, %xmm0 + addq INCX, X + decq M + jg .L106 + ALIGN_4 + +.L998: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + +#ifndef HAVE_SSE3 + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + addss %xmm1, %xmm0 +#else + haddps %xmm0, %xmm0 + haddps %xmm0, %xmm0 +#endif + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zsum_sse2.S b/kernel/x86_64/zsum_sse2.S new file mode 100644 index 000000000..6f667164d --- /dev/null +++ b/kernel/x86_64/zsum_sse2.S @@ -0,0 +1,283 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + xorps %xmm0, %xmm0 + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + salq $ZBASE_SHIFT, INCX + + cmpq $2 * SIZE, INCX + jne .L40 + + subq $-16 * SIZE, X + addq M, M + + testq $SIZE, X + je .L05 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -16 * SIZE(X), %xmm0 + addq $SIZE, X + + subq $1, M + jle .L999 + ALIGN_3 + +.L05: + movq M, I + sarq $4, I + jle .L20 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + movaps -8 * SIZE(X), %xmm8 + movaps -6 * SIZE(X), %xmm9 + movaps -4 * SIZE(X), %xmm10 + movaps -2 * SIZE(X), %xmm11 + + decq I + jle .L11 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + addpd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + addpd %xmm5, %xmm1 + movaps 2 * SIZE(X), %xmm5 + + addpd %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + addpd %xmm7, %xmm3 + movaps 6 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + addpd %xmm8, %xmm0 + movaps 8 * SIZE(X), %xmm8 + + addpd %xmm9, %xmm1 + movaps 10 * SIZE(X), %xmm9 + + addpd %xmm10, %xmm2 + movaps 12 * SIZE(X), %xmm10 + + addpd %xmm11, %xmm3 + movaps 14 * SIZE(X), %xmm11 + + subq $-16 * SIZE, X + decq I + jg .L10 + ALIGN_4 + +.L11: + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm10, %xmm2 + addpd %xmm11, %xmm3 + + subq $-16 * SIZE, X + ALIGN_3 + +.L20: + andq $15, M + jle .L998 + + testq $8, M + je .L21 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + addq $8 * SIZE, X + ALIGN_3 + +.L21: + testq $4, M + je .L22 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + + addq $4 * SIZE, X + ALIGN_3 + +.L22: + testq $2, M + je .L23 + + movaps -16 * SIZE(X), %xmm6 + addpd %xmm6, %xmm3 + addq $2 * SIZE, X + +.L23: + testq $1, M + je .L998 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -16 * SIZE(X), %xmm4 + addsd %xmm4, %xmm0 + jmp .L998 + ALIGN_3 + + +.L40: + movq M, I + sarq $2, I + jle .L60 + ALIGN_4 + +.L50: +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + prefetcht0 PREFETCHSIZE * SIZE(X) +#endif + +#ifdef PENTIUM4 + prefetchnta PREFETCHSIZE * SIZE(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 + addq INCX, X + addpd %xmm4, %xmm0 + + movsd 0 * SIZE(X), %xmm5 + movhpd 1 * SIZE(X), %xmm5 + addq INCX, X + addpd %xmm5, %xmm1 + + movsd 0 * SIZE(X), %xmm6 + movhpd 1 * SIZE(X), %xmm6 + addq INCX, X + addpd %xmm6, %xmm2 + + movsd 0 * SIZE(X), %xmm7 + movhpd 1 * SIZE(X), %xmm7 + addq INCX, X + addpd %xmm7, %xmm3 + + decq I + jg .L50 + ALIGN_4 + +.L60: + andq $3, M + jle .L998 + ALIGN_4 + + +.L61: + movsd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 + addpd %xmm4, %xmm0 + addq INCX, X + decq M + jg .L61 + ALIGN_4 + +.L998: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#ifndef HAVE_SSE3 + movhlps %xmm0, %xmm1 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index 3bcc32197..fe82d81e6 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -35,10 +35,10 @@ DASUMKERNEL = dasum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = zasum.c -SSUMKERNEL = ../arm/asum.c -DSUMKERNEL = dasum.c -CSUMKERNEL = ../arm/zasum.c -ZSUMKERNEL = zasum.c +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = dsum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = zsum.c SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = daxpy.c diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index d39b8447e..4160a1a76 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -233,9 +233,15 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, while (j < n1) { - temp0 = -da_i * x[i + 1]; + if (isnan(x[i]) || isinf(x[i])) + temp0 = NAN; + else + temp0 = -da_i * x[i + 1]; x[i + 1] = da_i * x[i]; x[i] = temp0; + if (isnan(x[i + inc_x]) || isinf(x[i + inc_x])) + temp1 = NAN; + else temp1 = -da_i * x[i + 1 + inc_x]; x[i + 1 + inc_x] = da_i * x[i + inc_x]; x[i + inc_x] = temp1; @@ -246,7 +252,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, while (j < n) { - temp0 = -da_i * x[i + 1]; + if (isnan(x[i]) || isinf(x[i])) + temp0 = NAN; + else + temp0 = -da_i * x[i + 1]; x[i + 1] = da_i * x[i]; x[i] = temp0; i += inc_x; @@ -320,7 +329,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if (da_i == 0) zscal_kernel_8_zero(n1, x); else - zscal_kernel_8_zero_r(n1, alpha, x); + zscal_kernel_8(n1, da_r, da_i, x); else if (da_i == 0) zscal_kernel_8_zero_i(n1, alpha, x); else @@ -347,7 +356,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, while (j < n) { - temp0 = -da_i * x[i + 1]; + if (isnan(x[i]) || isinf(x[i])) + temp0 = NAN; + else + temp0 = -da_i * x[i + 1]; x[i + 1] = da_i * x[i]; x[i] = temp0; i += 2; diff --git a/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake b/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake index 585ca26e7..17c0df80e 100644 --- a/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake +++ b/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake @@ -1,4 +1,4 @@ -# This module perdorms several try-compiles to determine the default integer +# This module performs several try-compiles to determine the default integer # size being used by the fortran compiler # # After execution, the following variables are set. If they are un set then diff --git a/lapack-netlib/CMAKE/Findcodecov.cmake b/lapack-netlib/CMAKE/Findcodecov.cmake index 384064007..93db45130 100644 --- a/lapack-netlib/CMAKE/Findcodecov.cmake +++ b/lapack-netlib/CMAKE/Findcodecov.cmake @@ -36,7 +36,7 @@ function(add_coverage TNAME) endfunction() -# Find the reuired flags foreach language. +# Find the required flags foreach language. set(CMAKE_REQUIRED_QUIET_SAVE ${CMAKE_REQUIRED_QUIET}) set(CMAKE_REQUIRED_QUIET ${codecov_FIND_QUIETLY}) @@ -118,7 +118,7 @@ function (codecov_path_of_source FILE RETURN_VAR) # If expression was found, SOURCEFILE is a generator-expression for an # object library. Currently we found no way to call this function automatic - # for the referenced target, so it must be called in the directoryso of the + # for the referenced target, so it must be called in the directory of the # object library definition. if(NOT "${_source}" STREQUAL "") set(${RETURN_VAR} "" PARENT_SCOPE) diff --git a/lapack-netlib/CMakeLists.txt b/lapack-netlib/CMakeLists.txt index b704e72c5..f1f47ae24 100644 --- a/lapack-netlib/CMakeLists.txt +++ b/lapack-netlib/CMakeLists.txt @@ -1,16 +1,20 @@ -cmake_minimum_required(VERSION 2.8.12) +cmake_minimum_required(VERSION 3.6) -project(LAPACK Fortran C) +project(LAPACK) set(LAPACK_MAJOR_VERSION 3) -set(LAPACK_MINOR_VERSION 9) +set(LAPACK_MINOR_VERSION 12) set(LAPACK_PATCH_VERSION 0) set( LAPACK_VERSION ${LAPACK_MAJOR_VERSION}.${LAPACK_MINOR_VERSION}.${LAPACK_PATCH_VERSION} ) -# Add the CMake directory for custon CMake modules +# Allow setting a prefix for the library names +set(CMAKE_STATIC_LIBRARY_PREFIX "lib${LIBRARY_PREFIX}") +set(CMAKE_SHARED_LIBRARY_PREFIX "lib${LIBRARY_PREFIX}") + +# Add the CMake directory for custom CMake modules set(CMAKE_MODULE_PATH "${LAPACK_SOURCE_DIR}/CMAKE" ${CMAKE_MODULE_PATH}) # Export all symbols on Windows when building shared libraries @@ -41,6 +45,48 @@ if(_is_coverage_build) find_package(codecov) endif() +# Use valgrind if it is found +option( LAPACK_TESTING_USE_PYTHON "Use Python for testing. Disable it on memory checks." ON ) +find_program( MEMORYCHECK_COMMAND valgrind ) +if( MEMORYCHECK_COMMAND ) + message( STATUS "Found valgrind: ${MEMORYCHECK_COMMAND}" ) + set( MEMORYCHECK_COMMAND_OPTIONS "--leak-check=full --show-leak-kinds=all --track-origins=yes" ) +endif() + +# By default test Fortran compiler complex abs and complex division +option(TEST_FORTRAN_COMPILER "Test Fortran compiler complex abs and complex division" OFF) +if( TEST_FORTRAN_COMPILER ) + + add_executable( test_zcomplexabs ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexabs.f ) + add_custom_target( run_test_zcomplexabs + COMMAND test_zcomplexabs 2> test_zcomplexabs.err + WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL + COMMENT "Running test_zcomplexabs in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zcomplexabs.err" + SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexabs.f ) + + add_executable( test_zcomplexdiv ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexdiv.f ) + add_custom_target( run_test_zcomplexdiv + COMMAND test_zcomplexdiv 2> test_zcomplexdiv.err + WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL + COMMENT "Running test_zcomplexdiv in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zcomplexdiv.err" + SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexdiv.f ) + + add_executable( test_zcomplexmult ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexmult.f ) + add_custom_target( run_test_zcomplexmult + COMMAND test_zcomplexmult 2> test_zcomplexmult.err + WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL + COMMENT "Running test_zcomplexmult in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zcomplexmult.err" + SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexmult.f ) + + add_executable( test_zminMax ${LAPACK_SOURCE_DIR}/INSTALL/test_zminMax.f ) + add_custom_target( run_test_zminMax + COMMAND test_zminMax 2> test_zminMax.err + WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL + COMMENT "Running test_zminMax in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zminMax.err" + SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zminMax.f ) + +endif() + # By default static library option(BUILD_SHARED_LIBS "Build shared libraries" OFF) @@ -61,6 +107,8 @@ else() set(LAPACKELIB "lapacke") set(TMGLIB "tmglib") endif() +# By default build standard API and extended _64 API +option(BUILD_INDEX64_EXT_API "Build Index-64 API as extended API with _64 suffix" ON) include(GNUInstallDirs) @@ -89,46 +137,23 @@ configure_file( include(PreventInSourceBuilds) include(PreventInBuildInstalls) -if(UNIX) - if(CMAKE_Fortran_COMPILER_ID STREQUAL Intel) - set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fp-model strict") - endif() - if(CMAKE_Fortran_COMPILER_ID STREQUAL XL) - set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict=none") - endif() -# Delete libmtsk in linking sequence for Sun/Oracle Fortran Compiler. -# This library is not present in the Sun package SolarisStudio12.3-linux-x86-bin - string(REPLACE \;mtsk\; \; CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES "${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES}") -endif() - -if(CMAKE_Fortran_COMPILER_ID STREQUAL Compaq) - if(WIN32) - if(CMAKE_GENERATOR STREQUAL "NMake Makefiles") - get_filename_component(CMAKE_Fortran_COMPILER_CMDNAM ${CMAKE_Fortran_COMPILER} NAME_WE) - message(STATUS "Using Compaq Fortran compiler with command name ${CMAKE_Fortran_COMPILER_CMDNAM}") - set(cmd ${CMAKE_Fortran_COMPILER_CMDNAM}) - string(TOLOWER "${cmd}" cmdlc) - if(cmdlc STREQUAL "df") - message(STATUS "Assume the Compaq Visual Fortran Compiler is being used") - set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) - set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_INCLUDES 1) - #This is a workaround that is needed to avoid forward-slashes in the - #filenames listed in response files from incorrectly being interpreted as - #introducing compiler command options - if(${BUILD_SHARED_LIBS}) - message(FATAL_ERROR "Making of shared libraries with CVF has not been tested.") - endif() - set(str "NMake version 9 or later should be used. NMake version 6.0 which is\n") - set(str "${str} included with the CVF distribution fails to build Lapack because\n") - set(str "${str} the number of source files exceeds the limit for NMake v6.0\n") - message(STATUS ${str}) - set(CMAKE_Fortran_LINK_EXECUTABLE "LINK /out: ") - endif() +# Add option to enable flat namespace for symbol resolution on macOS +if(APPLE) + option(USE_FLAT_NAMESPACE "Use flat namespaces for symbol resolution during build and runtime." OFF) + + if(USE_FLAT_NAMESPACE) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-flat_namespace") + set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Wl,-flat_namespace") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-flat_namespace") + else() + if(BUILD_SHARED_LIBS AND BUILD_TESTING) + message(WARNING + "LAPACK test suite might fail with shared libraries and the default two-level namespace. " + "Disable shared libraries or enable flat namespace for symbol resolution via -DUSE_FLAT_NAMESPACE=ON.") endif() endif() endif() - # -------------------------------------------------- set(LAPACK_INSTALL_EXPORT_NAME ${LAPACKLIB}-targets) @@ -149,13 +174,14 @@ option(BUILD_TESTING "Build tests" ${_is_coverage_build}) include(CTest) message(STATUS "Build tests: ${BUILD_TESTING}") -# lapack_testing.py uses features from python 2.7 and greater if(BUILD_TESTING) - set(_msg "Looking for Python >= 2.7 needed for summary tests") + set(_msg "Looking for Python3 needed for summary tests") message(STATUS "${_msg}") - find_package(PythonInterp 2.7 QUIET) - if(PYTHONINTERP_FOUND) - message(STATUS "${_msg} - found (${PYTHON_VERSION_STRING})") + # find_package(PythonInterp 3) cannot be used because /usr/bin/python may be + # a Python2 interpreter. + find_program(PYTHON_EXECUTABLE python3) + if(PYTHON_EXECUTABLE) + message(STATUS "${_msg} - found") else() message(STATUS "${_msg} - not found (skipping summary tests)") endif() @@ -168,26 +194,6 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LAPACK_BINARY_DIR}/bin) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LAPACK_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LAPACK_BINARY_DIR}/lib) -# -------------------------------------------------- -# Check for any necessary platform specific compiler flags -include(CheckLAPACKCompilerFlags) -CheckLAPACKCompilerFlags() - -# -------------------------------------------------- -# Check second function - -include(CheckTimeFunction) -set(TIME_FUNC NONE ${TIME_FUNC}) -CHECK_TIME_FUNCTION(NONE TIME_FUNC) -CHECK_TIME_FUNCTION(INT_CPU_TIME TIME_FUNC) -CHECK_TIME_FUNCTION(EXT_ETIME TIME_FUNC) -CHECK_TIME_FUNCTION(EXT_ETIME_ TIME_FUNC) -CHECK_TIME_FUNCTION(INT_ETIME TIME_FUNC) -message(STATUS "--> Will use second_${TIME_FUNC}.f and dsecnd_${TIME_FUNC}.f as timing function.") - -set(SECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/second_${TIME_FUNC}.f) -set(DSECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/dsecnd_${TIME_FUNC}.f) - # deprecated LAPACK and LAPACKE routines option(BUILD_DEPRECATED "Build deprecated routines" OFF) message(STATUS "Build deprecated routines: ${BUILD_DEPRECATED}") @@ -210,6 +216,7 @@ if(NOT (BUILD_SINGLE OR BUILD_DOUBLE OR BUILD_COMPLEX OR BUILD_COMPLEX16)) BUILD_SINGLE, BUILD_DOUBLE, BUILD_COMPLEX, BUILD_COMPLEX16.") endif() + # -------------------------------------------------- # Subdirectories that need to be processed option(USE_OPTIMIZED_BLAS "Whether or not to use an optimized BLAS library instead of included netlib BLAS" OFF) @@ -279,18 +286,27 @@ endif() # Check the usage of the user provided or automatically found LAPACK libraries if(LAPACK_LIBRARIES) - include(CheckFortranFunctionExists) - set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES}) - # Check if new routine of 3.4.0 is in LAPACK_LIBRARIES - CHECK_FORTRAN_FUNCTION_EXISTS("dgeqrt" LATESTLAPACK_FOUND) - unset(CMAKE_REQUIRED_LIBRARIES) - if(LATESTLAPACK_FOUND) - message(STATUS "--> LAPACK supplied by user is WORKING, will use ${LAPACK_LIBRARIES}.") + include(CheckLanguage) + check_language(Fortran) + if(CMAKE_Fortran_COMPILER) + enable_language(Fortran) + include(CheckFortranFunctionExists) + set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES}) + # Check if new routine of 3.4.0 is in LAPACK_LIBRARIES + CHECK_FORTRAN_FUNCTION_EXISTS("dgeqrt" LATESTLAPACK_FOUND) + unset(CMAKE_REQUIRED_LIBRARIES) + if(LATESTLAPACK_FOUND) + message(STATUS "--> LAPACK supplied by user is WORKING, will use ${LAPACK_LIBRARIES}.") + else() + message(ERROR "--> LAPACK supplied by user is not WORKING or is older than LAPACK 3.4.0, CANNOT USE ${LAPACK_LIBRARIES}.") + message(ERROR "--> Will use REFERENCE LAPACK (by default)") + message(ERROR "--> Or Correct your LAPACK_LIBRARIES entry ") + message(ERROR "--> Or Consider checking USE_OPTIMIZED_LAPACK") + endif() else() - message(ERROR "--> LAPACK supplied by user is not WORKING or is older than LAPACK 3.4.0, CANNOT USE ${LAPACK_LIBRARIES}.") - message(ERROR "--> Will use REFERENCE LAPACK (by default)") - message(ERROR "--> Or Correct your LAPACK_LIBRARIES entry ") - message(ERROR "--> Or Consider checking USE_OPTIMIZED_LAPACK") + message(STATUS "--> LAPACK supplied by user is ${LAPACK_LIBRARIES}.") + message(STATUS "--> CMake couldn't find a Fortran compiler, so it cannot check if the provided LAPACK library works.") + set(LATESTLAPACK_FOUND TRUE) endif() endif() @@ -298,6 +314,27 @@ endif() if(NOT LATESTLAPACK_FOUND) message(STATUS "Using supplied NETLIB LAPACK implementation") set(LAPACK_LIBRARIES ${LAPACKLIB}) + + enable_language(Fortran) + + # Check for any necessary platform specific compiler flags + include(CheckLAPACKCompilerFlags) + CheckLAPACKCompilerFlags() + + # Check second function + include(CheckTimeFunction) + set(TIME_FUNC NONE) + CHECK_TIME_FUNCTION(NONE TIME_FUNC) + CHECK_TIME_FUNCTION(INT_CPU_TIME TIME_FUNC) + CHECK_TIME_FUNCTION(EXT_ETIME TIME_FUNC) + CHECK_TIME_FUNCTION(EXT_ETIME_ TIME_FUNC) + CHECK_TIME_FUNCTION(INT_ETIME TIME_FUNC) + + # Set second function + message(STATUS "--> Will use second_${TIME_FUNC}.f and dsecnd_${TIME_FUNC}.f as timing function.") + set(SECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/second_${TIME_FUNC}.f) + set(DSECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/dsecnd_${TIME_FUNC}.f) + add_subdirectory(SRC) else() set(CMAKE_EXE_LINKER_FLAGS @@ -325,35 +362,88 @@ option(LAPACKE_WITH_TMG "Build LAPACKE with tmglib routines" OFF) if(LAPACKE_WITH_TMG) set(LAPACKE ON) endif() -if(BUILD_TESTING OR LAPACKE_WITH_TMG) #already included, avoid double inclusion + +# TMGLIB +# Cache export target +set(LAPACK_INSTALL_EXPORT_NAME_CACHE ${LAPACK_INSTALL_EXPORT_NAME}) +if(BUILD_TESTING OR LAPACKE_WITH_TMG) + enable_language(Fortran) + if(LATESTLAPACK_FOUND AND LAPACKE_WITH_TMG) + set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES}) + # Check if dlatms (part of tmg) is found + include(CheckFortranFunctionExists) + CHECK_FORTRAN_FUNCTION_EXISTS("dlatms" LAPACK_WITH_TMGLIB_FOUND) + unset(CMAKE_REQUIRED_LIBRARIES) + if(NOT LAPACK_WITH_TMGLIB_FOUND) + # Build and install TMG as part of LAPACKE targets (as opposed to LAPACK + # targets) + set(LAPACK_INSTALL_EXPORT_NAME ${LAPACKELIB}-targets) + endif() + endif() add_subdirectory(TESTING/MATGEN) endif() +# Reset export target +set(LAPACK_INSTALL_EXPORT_NAME ${LAPACK_INSTALL_EXPORT_NAME_CACHE}) +unset(LAPACK_INSTALL_EXPORT_NAME_CACHE) + + +#------------------------------------- +# LAPACKE +# Include lapack.h and lapacke_mangling.h even if LAPACKE is not built +add_subdirectory(LAPACKE/include) if(LAPACKE) add_subdirectory(LAPACKE) endif() + #------------------------------------- # BLAS++ / LAPACK++ option(BLAS++ "Build BLAS++" OFF) option(LAPACK++ "Build LAPACK++" OFF) - - + + function(_display_cpp_implementation_msg name) string(TOLOWER ${name} name_lc) message(STATUS "${name}++ enable") message(STATUS "----------------") message(STATUS "Thank you for your interest in ${name}++, a newly developed C++ API for ${name} library") message(STATUS "The objective of ${name}++ is to provide a convenient, performance oriented API for development in the C++ language, that, for the most part, preserves established conventions, while, at the same time, takes advantages of modern C++ features, such as: namespaces, templates, exceptions, etc.") - message(STATUS "We are still working on integrating ${name}++ in our library. For the moment, you can download directly ${name_lc}++ from https://bitbucket.org/icl/${name_lc}pp") message(STATUS "For support ${name}++ related question, please email: slate-user@icl.utk.edu") message(STATUS "----------------") endfunction() -if(BLAS++) +if (BLAS++) _display_cpp_implementation_msg("BLAS") + include(ExternalProject) + ExternalProject_Add(blaspp + URL https://bitbucket.org/icl/blaspp/downloads/blaspp-2020.10.02.tar.gz + CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/blaspp-prefix/src/blaspp + BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" ${CMAKE_COMMAND} --build . + INSTALL_COMMAND ${CMAKE_COMMAND} -E env PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" ${CMAKE_COMMAND} --install . + ) + ExternalProject_Add_StepDependencies(blaspp build ${BLAS_LIBRARIES}) endif() -if(LAPACK++) +if (LAPACK++) + message (STATUS "linking lapack++ against ${LAPACK_LIBRARIES}") _display_cpp_implementation_msg("LAPACK") + include(ExternalProject) + if (BUILD_SHARED_LIBS) + ExternalProject_Add(lapackpp + URL https://bitbucket.org/icl/lapackpp/downloads/lapackpp-2020.10.02.tar.gz + CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DLAPACK_LIBRARIES=${LAPACK_LIBRARIES} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp + BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" ${CMAKE_COMMAND} --build . + INSTALL_COMMAND ${CMAKE_COMMAND} -E env PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" ${CMAKE_COMMAND} --install . + ) + else () +# FIXME this does not really work as the libraries list gets converted to a semicolon-separated list somewhere in the lapack++ build files + ExternalProject_Add(lapackpp + URL https://bitbucket.org/icl/lapackpp/downloads/lapackpp-2020.10.02.tar.gz + CONFIGURE_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DLAPACK_LIBRARIES="${PROJECT_BINARY_DIR}/lib/liblapack.a -lgfortran" -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp + BUILD_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" ${CMAKE_COMMAND} --build . + INSTALL_COMMAND ${CMAKE_COMMAND} -E env PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" ${CMAKE_COMMAND} --install . + ) + endif() + ExternalProject_Add_StepDependencies(lapackpp build blaspp ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) endif() # -------------------------------------------------- @@ -370,7 +460,7 @@ set(CPACK_MONOLITHIC_INSTALL ON) set(CPACK_PACKAGE_INSTALL_DIRECTORY "LAPACK") if(WIN32 AND NOT UNIX) # There is a bug in NSI that does not handle full unix paths properly. Make - # sure there is at least one set of four (4) backlasshes. + # sure there is at least one set of four (4) backslashes. set(CPACK_NSIS_HELP_LINK "http:\\\\\\\\http://icl.cs.utk.edu/lapack-forum") set(CPACK_NSIS_URL_INFO_ABOUT "http:\\\\\\\\www.netlib.org/lapack") set(CPACK_NSIS_CONTACT "lapack@eecs.utk.edu") @@ -396,10 +486,6 @@ if(NOT LATESTLAPACK_FOUND) set(ALL_TARGETS ${ALL_TARGETS} ${LAPACKLIB}) endif() -if(BUILD_TESTING OR LAPACKE_WITH_TMG) - set(ALL_TARGETS ${ALL_TARGETS} ${TMGLIB}) -endif() - # Export lapack targets, not including lapacke, from the # install tree, if any. set(_lapack_config_install_guard_target "") @@ -424,6 +510,10 @@ if(LAPACKE) set(ALL_TARGETS ${ALL_TARGETS} ${LAPACKELIB}) endif() +if(NOT LAPACK_WITH_TMGLIB_FOUND AND LAPACKE_WITH_TMG) + set(ALL_TARGETS ${ALL_TARGETS} ${TMGLIB}) +endif() + # Export lapack and lapacke targets from the build tree, if any. set(_lapack_config_build_guard_target "") if(ALL_TARGETS) @@ -461,4 +551,142 @@ install(FILES DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${LAPACKLIB}-${LAPACK_VERSION} COMPONENT Development ) - +if (LAPACK++) + install( + DIRECTORY "${LAPACK_BINARY_DIR}/lib/" + DESTINATION ${CMAKE_INSTALL_LIBDIR} + FILES_MATCHING REGEX "liblapackpp.(a|so)$" + ) + install( + DIRECTORY "${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp/include/" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" + FILES_MATCHING REGEX "\\.(h|hh)$" + ) + write_basic_package_version_file( + "lapackppConfigVersion.cmake" + VERSION 2020.10.02 + COMPATIBILITY AnyNewerVersion + ) + install( + FILES "${CMAKE_CURRENT_BINARY_DIR}/lib/lapackpp/lapackppConfig.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/lib/lapackpp/lapackppConfigVersion.cmake" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/" + ) + +endif() +if (BLAS++) + write_basic_package_version_file( + "blasppConfigVersion.cmake" + VERSION 2020.10.02 + COMPATIBILITY AnyNewerVersion + ) + install( + FILES "${CMAKE_CURRENT_BINARY_DIR}/lib/blaspp/blasppConfig.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/lib/blaspp/blasppConfigVersion.cmake" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/" + ) + install( + DIRECTORY "${LAPACK_BINARY_DIR}/lib/" + DESTINATION ${CMAKE_INSTALL_LIBDIR} + FILES_MATCHING REGEX "libblaspp.(a|so)$" + ) + install( + DIRECTORY "${PROJECT_BINARY_DIR}/blaspp-prefix/src/blaspp/include/" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" + FILES_MATCHING REGEX "\\.(h|hh)$" + ) +endif() + +# -------------------------------------------------- +# Generate MAN and/or HTML Documentation +option(BUILD_HTML_DOCUMENTATION "Create and install the HTML based API +documentation (requires Doxygen) - command: make html" OFF) +option(BUILD_MAN_DOCUMENTATION "Create and install the MAN based documentation (requires Doxygen) - command: make man" OFF) +message(STATUS "Build html documentation: ${BUILD_HTML_DOCUMENTATION}") +message(STATUS "Build man documentation: ${BUILD_MAN_DOCUMENTATION}") + +if(BUILD_HTML_DOCUMENTATION OR BUILD_MAN_DOCUMENTATION) + find_package(Doxygen) + if(NOT DOXYGEN_FOUND) + message(WARNING "Doxygen is needed to build the documentation.") + + else() + + set(DOXYGEN_PROJECT_BRIEF "LAPACK: Linear Algebra PACKage") + set(DOXYGEN_PROJECT_NUMBER ${LAPACK_VERSION}) + set(DOXYGEN_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/DOCS) + set(DOXYGEN_PROJECT_LOGO ${CMAKE_CURRENT_SOURCE_DIR}/DOCS/lapack.png) + set(DOXYGEN_OPTIMIZE_FOR_FORTRAN YES) + set(DOXYGEN_SOURCE_BROWSER YES) + set(DOXYGEN_CREATE_SUBDIRS YES) + set(DOXYGEN_SEPARATE_MEMBER_PAGES YES) + set(DOXYGEN_EXTRACT_ALL YES) + set(DOXYGEN_FILE_PATTERNS *.f *.f90 *.c *.h ) + set(DOXYGEN_RECURSIVE YES) + set(DOXYGEN_GENERATE_TREEVIEW YES) + set(DOXYGEN_DOT_IMAGE_FORMAT svg) + set(DOXYGEN_INTERACTIVE_SVG YES) + set(DOXYGEN_QUIET NO) + set(DOXYGEN_WARNINGS YES) + set(DOXYGEN_WARN_NO_PARAMDOC YES) + set(DOXYGEN_WARN_LOGFILE doxygen_error) + set(DOXYGEN_GENERATE_HTML NO) + set(DOXYGEN_GENERATE_MAN NO) + set(DOXYGEN_LAYOUT_FILE "DOCS/DoxygenLayout.xml") + + # Exclude functions that are duplicated, creating conflicts. + set(DOXYGEN_EXCLUDE .git + .github + SRC/VARIANTS + BLAS/SRC/lsame.f + BLAS/SRC/xerbla.f + BLAS/SRC/xerbla_array.f + INSTALL/slamchf77.f + INSTALL/dlamchf77.f ) + + if (BUILD_HTML_DOCUMENTATION) + set(DOXYGEN_GENERATE_HTML YES) + set(DOXYGEN_HTML_OUTPUT explore-html) + set(DOXYGEN_INLINE_SOURCES YES) + set(DOXYGEN_CALL_GRAPH YES) + set(DOXYGEN_CALLER_GRAPH YES) + + doxygen_add_docs( + html + + # Doxygen INPUT = + ${PROJECT_SOURCE_DIR}/README.md + ${PROJECT_SOURCE_DIR}/BLAS + ${PROJECT_SOURCE_DIR}/CBLAS + ${PROJECT_SOURCE_DIR}/SRC + ${PROJECT_SOURCE_DIR}/INSTALL + ${PROJECT_SOURCE_DIR}/TESTING + ${PROJECT_SOURCE_DIR}/DOCS/groups-usr.dox + + COMMENT "Generating html LAPACK documentation (it will take some time... time to grab a coffee)" + ) + endif() + if (BUILD_MAN_DOCUMENTATION) + set(DOXYGEN_GENERATE_MAN YES) + set(DOXYGEN_MAN_LINKS YES) + set(DOXYGEN_INLINE_SOURCES NO) + set(DOXYGEN_CALL_GRAPH NO) + set(DOXYGEN_CALLER_GRAPH NO) + + doxygen_add_docs( + man + + # Doxygen INPUT = + ${PROJECT_SOURCE_DIR}/BLAS + ${PROJECT_SOURCE_DIR}/CBLAS + ${PROJECT_SOURCE_DIR}/SRC + ${PROJECT_SOURCE_DIR}/INSTALL + ${PROJECT_SOURCE_DIR}/TESTING + ${PROJECT_SOURCE_DIR}/DOCS/groups-usr.dox + + COMMENT "Generating man LAPACK documentation" + ) + endif() + + endif() +endif() diff --git a/lapack-netlib/DOCS/Doxyfile b/lapack-netlib/DOCS/Doxyfile index 43cea43b5..577675772 100644 --- a/lapack-netlib/DOCS/Doxyfile +++ b/lapack-netlib/DOCS/Doxyfile @@ -1,4 +1,4 @@ -# Doxyfile 1.8.10 +# Doxyfile 1.9.1 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. @@ -17,11 +17,11 @@ # Project related configuration options #--------------------------------------------------------------------------- -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all text -# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv -# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv -# for the list of possible encodings. +# This tag specifies the encoding used for all characters in the configuration +# file that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. # The default value is: UTF-8. DOXYFILE_ENCODING = UTF-8 @@ -38,7 +38,7 @@ PROJECT_NAME = LAPACK # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 3.9.0 +PROJECT_NUMBER = 3.12.0 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -93,6 +93,14 @@ ALLOW_UNICODE_NAMES = NO OUTPUT_LANGUAGE = English +# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all generated output in the proper direction. +# Possible values are: None, LTR, RTL and Context. +# The default value is: None. + +OUTPUT_TEXT_DIRECTION = None + # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. @@ -179,6 +187,16 @@ SHORT_NAMES = NO JAVADOC_AUTOBRIEF = NO +# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line +# such as +# /*************** +# as being the beginning of a Javadoc-style comment "banner". If set to NO, the +# Javadoc-style will behave just like regular comments and it will not be +# interpreted by doxygen. +# The default value is: NO. + +JAVADOC_BANNER = NO + # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus @@ -199,6 +217,14 @@ QT_AUTOBRIEF = NO MULTILINE_CPP_IS_BRIEF = NO +# By default Python docstrings are displayed as preformatted text and doxygen's +# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the +# doxygen's special commands can be used and the contents of the docstring +# documentation blocks is shown as doxygen documentation. +# The default value is: YES. + +PYTHON_DOCSTRING = YES + # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the # documentation from any documented member that it re-implements. # The default value is: YES. @@ -226,16 +252,15 @@ TAB_SIZE = 8 # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading # "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines. +# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. +# When you need a literal { or } or , in the value part of an alias you have to +# escape them by means of a backslash (\), this can lead to conflicts with the +# commands \{ and \} for these it is advised to use the version @{ and @} or use +# a double escape (\\{ and \\}) ALIASES = -# This tag can be used to specify a number of word-keyword mappings (TCL only). -# A mapping has the form "name=value". For example adding "class=itcl::class" -# will allow you to use the command class in the itcl::class meaning. - -TCL_SUBST = - # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For # instance, some of the names that are used will be different. The list of all @@ -264,28 +289,40 @@ OPTIMIZE_FOR_FORTRAN = YES OPTIMIZE_OUTPUT_VHDL = NO +# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice +# sources only. Doxygen will then generate output that is more tailored for that +# language. For instance, namespaces will be presented as modules, types will be +# separated into more groups, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_SLICE = NO + # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and -# language is one of the parsers supported by doxygen: IDL, Java, Javascript, -# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: -# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: -# Fortran. In the later case the parser tries to guess whether the code is fixed -# or free formatted code, this is the default for Fortran type files), VHDL. For -# instance to make doxygen treat .inc files as Fortran files (default is PHP), -# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# language is one of the parsers supported by doxygen: IDL, Java, JavaScript, +# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, VHDL, +# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser +# tries to guess whether the code is fixed or free formatted code, this is the +# default for Fortran type files). For instance to make doxygen treat .inc files +# as Fortran files (default is PHP), and .f files as C (default is Fortran), +# use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise -# the files are not read by doxygen. +# the files are not read by doxygen. When specifying no_extension you should add +# * to the FILE_PATTERNS. +# +# Note see also the list of default file extension mappings. EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable -# documentation. See http://daringfireball.net/projects/markdown/ for details. +# documentation. See https://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibilities issues. @@ -293,6 +330,15 @@ EXTENSION_MAPPING = MARKDOWN_SUPPORT = YES +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 5. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 5 + # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or @@ -318,7 +364,7 @@ BUILTIN_STL_SUPPORT = NO CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. @@ -341,7 +387,7 @@ IDL_PROPERTY_SUPPORT = YES # all members of a group must be documented explicitly. # The default value is: NO. -DISTRIBUTE_GROUP_DOC = YES +DISTRIBUTE_GROUP_DOC = NO # If one adds a struct or class to a group and this option is enabled, then also # any nested class or struct is added to the same group. By default this option @@ -404,6 +450,19 @@ TYPEDEF_HIDES_STRUCT = NO LOOKUP_CACHE_SIZE = 0 +# The NUM_PROC_THREADS specifies the number threads doxygen is allowed to use +# during processing. When set to 0 doxygen will based this on the number of +# cores available in the system. You can set it explicitly to a value larger +# than 0 to get more control over the balance between CPU load and processing +# speed. At this moment only the input processing can be done using multiple +# threads. Since this is still an experimental feature the default is set to 1, +# which efficively disables parallel processing. Please report any issues you +# encounter. Generating dot graphs in parallel is controlled by the +# DOT_NUM_THREADS setting. +# Minimum value: 0, maximum value: 32, default value: 1. + +NUM_PROC_THREADS = 1 + #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- @@ -424,6 +483,12 @@ EXTRACT_ALL = YES EXTRACT_PRIVATE = NO +# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual +# methods of a class will be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIV_VIRTUAL = NO + # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. @@ -461,6 +526,13 @@ EXTRACT_LOCAL_METHODS = NO EXTRACT_ANON_NSPACES = NO +# If this flag is set to YES, the name of an unnamed parameter in a declaration +# will be determined by the corresponding definition. By default unnamed +# parameters remain unnamed in the output. +# The default value is: YES. + +RESOLVE_UNNAMED_PARAMS = YES + # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation @@ -478,8 +550,8 @@ HIDE_UNDOC_MEMBERS = NO HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend -# (class|struct|union) declarations. If set to NO, these declarations will be -# included in the documentation. +# declarations. If set to NO, these declarations will be included in the +# documentation. # The default value is: NO. HIDE_FRIEND_COMPOUNDS = NO @@ -498,11 +570,18 @@ HIDE_IN_BODY_DOCS = NO INTERNAL_DOCS = NO -# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file -# names in lower-case letters. If set to YES, upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. +# With the correct setting of option CASE_SENSE_NAMES doxygen will better be +# able to match the capabilities of the underlying filesystem. In case the +# filesystem is case sensitive (i.e. it supports files in the same directory +# whose names only differ in casing), the option must be set to YES to properly +# deal with such files in case they appear in the input. For filesystems that +# are not case sensitive the option should be be set to NO to properly deal with +# output files written for symbols that only differ in casing, such as for two +# classes, one named CLASS and the other named Class, and to also support +# references to files without having to specify the exact matching casing. On +# Windows (including Cygwin) and MacOS, users should typically set this option +# to NO, whereas on Linux or other Unix flavors it should typically be set to +# YES. # The default value is: system dependent. CASE_SENSE_NAMES = NO @@ -684,12 +763,12 @@ FILE_VERSION_FILTER = # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. -LAYOUT_FILE = +LAYOUT_FILE = DOCS/DoxygenLayout.xml # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib # extension is automatically appended if omitted. This requires the bibtex tool -# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. @@ -705,7 +784,7 @@ CITE_BIB_FILES = # messages are off. # The default value is: NO. -QUIET = YES +QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES @@ -734,10 +813,20 @@ WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong or incomplete -# parameter documentation, but not about the absence of documentation. +# parameter documentation, but not about the absence of documentation. If +# EXTRACT_ALL is set to YES then this flag will automatically be disabled. # The default value is: NO. -WARN_NO_PARAMDOC = NO +WARN_NO_PARAMDOC = YES + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS +# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but +# at the end of the doxygen process doxygen will return with a non-zero status. +# Possible values are: NO, YES and FAIL_ON_WARNINGS. +# The default value is: NO. + +WARN_AS_ERROR = NO # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which @@ -753,7 +842,7 @@ WARN_FORMAT = "$file:$line: $text" # messages should be written. If left blank the output is written to standard # error (stderr). -WARN_LOGFILE = output_err +WARN_LOGFILE = doxygen_error #--------------------------------------------------------------------------- # Configuration options related to the input files @@ -762,17 +851,18 @@ WARN_LOGFILE = output_err # The INPUT tag is used to specify the files and/or directories that contain # documented source files. You may enter file names like myfile.cpp or # directories like /usr/src/myproject. Separate the files or directories with -# spaces. +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = . \ - DOCS/groups-usr.dox +INPUT = BLAS CBLAS SRC INSTALL TESTING \ + DOCS/groups-usr.dox \ + README.md # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: http://www.gnu.org/software/libiconv) for the list of -# possible encodings. +# documentation (see: +# https://www.gnu.org/software/libiconv/) for the list of possible encodings. # The default value is: UTF-8. INPUT_ENCODING = UTF-8 @@ -785,14 +875,19 @@ INPUT_ENCODING = UTF-8 # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. # +# Note the list of default checked file patterns might differ from the list of +# default file extension mappings. +# # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, -# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, -# *.vhdl, *.ucf, *.qsf, *.as and *.js. +# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment), +# *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl, +# *.ucf, *.qsf and *.ice. FILE_PATTERNS = *.c \ *.f \ + *.f90 \ *.h # The RECURSIVE tag can be used to specify whether or not subdirectories should @@ -808,34 +903,15 @@ RECURSIVE = YES # Note that relative paths are relative to the directory from which doxygen is # run. -EXCLUDE = CMAKE \ - DOCS \ - .svn \ - CBLAS/.svn \ - CBLAS/src/.svn \ - CBLAS/testing/.svn \ - CBLAS/example/.svn \ - CBLAS/include/.svn \ - BLAS/.svn \ - BLAS/SRC/.svn \ - BLAS/TESTING/.svn \ - SRC/.svn \ - SRC/VARIANTS/.svn \ - SRC/VARIANTS/LIB/.svn \ - SRC/VARIANTS/cholesky/.svn \ - SRC/VARIANTS/cholesky/RL/.svn \ - SRC/VARIANTS/cholesky/TOP/.svn \ - SRC/VARIANTS/lu/.svn \ - SRC/VARIANTS/lu/CR/.svn \ - SRC/VARIANTS/lu/LL/.svn \ - SRC/VARIANTS/lu/REC/.svn \ - SRC/VARIANTS/qr/.svn \ - SRC/VARIANTS/qr/LL/.svn \ - INSTALL/.svn \ - TESTING/.svn \ - TESTING/EIG/.svn \ - TESTING/MATGEN/.svn \ - TESTING/LIN/.svn +# Exclude functions that are duplicated, creating conflicts. +EXCLUDE = .git \ + .github \ + SRC/VARIANTS \ + BLAS/SRC/lsame.f \ + BLAS/SRC/xerbla.f \ + BLAS/SRC/xerbla_array.f \ + INSTALL/slamchf77.f \ + INSTALL/dlamchf77.f \ # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -908,6 +984,10 @@ IMAGE_PATH = # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. INPUT_FILTER = @@ -917,6 +997,10 @@ INPUT_FILTER = # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. FILTER_PATTERNS = @@ -969,7 +1053,7 @@ INLINE_SOURCES = YES STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented -# function all documented functions referencing it will be listed. +# entity all documented functions referencing it will be listed. # The default value is: NO. REFERENCED_BY_RELATION = NO @@ -1001,12 +1085,12 @@ SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system -# (see http://www.gnu.org/software/global/global.html). You will need version +# (see https://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. # # To use it do the following: # - Install the latest version of global -# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # @@ -1028,25 +1112,6 @@ USE_HTAGS = NO VERBATIM_HEADERS = YES -# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the -# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the -# cost of reduced performance. This can be particularly helpful with template -# rich C++ code for which doxygen's built-in parser lacks the necessary type -# information. -# Note: The availability of this option depends on whether or not doxygen was -# compiled with the --with-libclang option. -# The default value is: NO. - -CLANG_ASSISTED_PARSING = NO - -# If clang assisted parsing is enabled you can provide the compiler with command -# line options that you would normally use when invoking the compiler. Note that -# the include paths will already be set by doxygen for the files and directories -# specified with INPUT and INCLUDE_PATH. -# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. - -CLANG_OPTIONS = - #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- @@ -1058,13 +1123,6 @@ CLANG_OPTIONS = ALPHABETICAL_INDEX = YES -# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in -# which the alphabetical index list will be split. -# Minimum value: 1, maximum value: 20, default value: 5. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. - -COLS_IN_ALPHA_INDEX = 5 - # In case all classes in a project start with a common prefix, all classes will # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag # can be used to specify a prefix (or a list of prefixes) that should be ignored @@ -1165,7 +1223,7 @@ HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a colorwheel, see -# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. # Minimum value: 0, maximum value: 359, default value: 220. @@ -1201,6 +1259,17 @@ HTML_COLORSTYLE_GAMMA = 80 HTML_TIMESTAMP = YES +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via JavaScript. If disabled, the navigation index will +# consists of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have JavaScript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. @@ -1224,13 +1293,14 @@ HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: http://developer.apple.com/tools/xcode/), introduced with -# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a -# Makefile in the HTML output directory. Running make will produce the docset in -# that directory and running make install will install the docset in +# environment (see: +# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To +# create a documentation set, doxygen will generate a Makefile in the HTML +# output directory. Running make will produce the docset in that directory and +# running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at -# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html -# for more information. +# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy +# genXcode/_index.html for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. @@ -1269,8 +1339,8 @@ DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on -# Windows. +# (see: +# https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML @@ -1300,7 +1370,7 @@ CHM_FILE = HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated -# (YES) or that it should be included in the master .chm file (NO). +# (YES) or that it should be included in the main .chm file (NO). # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. @@ -1345,7 +1415,8 @@ QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace -# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1353,8 +1424,8 @@ QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual -# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- -# folders). +# Folders (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1362,30 +1433,30 @@ QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- -# filters). +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- -# filters). +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: -# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = -# The QHG_LOCATION tag can be used to specify the location of Qt's -# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the -# generated .qhp file. +# The QHG_LOCATION tag can be used to specify the location (absolute path +# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to +# run qhelpgenerator on the generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. QHG_LOCATION = @@ -1462,6 +1533,17 @@ TREEVIEW_WIDTH = 250 EXT_LINKS_IN_WINDOW = NO +# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg +# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see +# https://inkscape.org) to generate formulas as SVG images instead of PNGs for +# the HTML output. These images will generally look nicer at scaled resolutions. +# Possible values are: png (the default) and svg (looks nicer but requires the +# pdf2svg or inkscape tool). +# The default value is: png. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FORMULA_FORMAT = png + # Use this tag to change the font size of LaTeX formulas included as images in # the HTML documentation. When you change the font size after a successful # doxygen run you need to manually remove any form_*.png images from the HTML @@ -1471,7 +1553,7 @@ EXT_LINKS_IN_WINDOW = NO FORMULA_FONTSIZE = 10 -# Use the FORMULA_TRANPARENT tag to determine whether or not the images +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are not # supported properly for IE 6.0, but are supported on all modern browsers. # @@ -1482,8 +1564,14 @@ FORMULA_FONTSIZE = 10 FORMULA_TRANSPARENT = YES +# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands +# to create new LaTeX commands to be used in formulas as building blocks. See +# the section "Including formulas" for details. + +FORMULA_MACROFILE = + # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see -# http://www.mathjax.org) which uses client side Javascript for the rendering +# https://www.mathjax.org) which uses client side JavaScript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX # installed or if you want to formulas look prettier in the HTML output. When # enabled you may also need to install MathJax separately and configure the path @@ -1495,7 +1583,7 @@ USE_MATHJAX = NO # When MathJax is enabled you can set the default output format to be used for # the MathJax output. See the MathJax site (see: -# http://docs.mathjax.org/en/latest/output.html) for more details. +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. # Possible values are: HTML-CSS (which is slower, but has the best # compatibility), NativeMML (i.e. MathML) and SVG. # The default value is: HTML-CSS. @@ -1510,8 +1598,8 @@ MATHJAX_FORMAT = HTML-CSS # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of -# MathJax from http://www.mathjax.org before deployment. -# The default value is: http://cdn.mathjax.org/mathjax/latest. +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdn.jsdelivr.net/npm/mathjax@2. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_RELPATH = http://www.mathjax.org/mathjax @@ -1525,7 +1613,8 @@ MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site -# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. @@ -1553,7 +1642,7 @@ MATHJAX_CODEFILE = SEARCHENGINE = YES # When the SERVER_BASED_SEARCH tag is enabled the search engine will be -# implemented using a web server instead of a web client using Javascript. There +# implemented using a web server instead of a web client using JavaScript. There # are two flavors of web server based searching depending on the EXTERNAL_SEARCH # setting. When disabled, doxygen will generate a PHP script for searching and # an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing @@ -1572,7 +1661,8 @@ SERVER_BASED_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: http://xapian.org/). +# Xapian (see: +# https://xapian.org/). # # See the section "External Indexing and Searching" for details. # The default value is: NO. @@ -1585,8 +1675,9 @@ EXTERNAL_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: http://xapian.org/). See the section "External Indexing and -# Searching" for details. +# Xapian (see: +# https://xapian.org/). See the section "External Indexing and Searching" for +# details. # This tag requires that the tag SEARCHENGINE is set to YES. SEARCHENGINE_URL = @@ -1637,21 +1728,35 @@ LATEX_OUTPUT = latex # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be # invoked. # -# Note that when enabling USE_PDFLATEX this option is only used for generating -# bitmaps for formulas in the HTML output, but not in the Makefile that is -# written to the output directory. -# The default file is: latex. +# Note that when not enabling USE_PDFLATEX the default is latex when enabling +# USE_PDFLATEX the default is pdflatex and when in the later case latex is +# chosen this is overwritten by pdflatex. For specific output languages the +# default can have been set differently, this depends on the implementation of +# the output language. # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_CMD_NAME = latex # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate # index for LaTeX. +# Note: This tag is used in the Makefile / make.bat. +# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file +# (.tex). # The default file is: makeindex. # This tag requires that the tag GENERATE_LATEX is set to YES. MAKEINDEX_CMD_NAME = makeindex +# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to +# generate index for LaTeX. In case there is no backslash (\) as first character +# it will be automatically added in the LaTeX code. +# Note: This tag is used in the generated output file (.tex). +# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat. +# The default value is: makeindex. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_MAKEINDEX_CMD = makeindex + # If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX # documents. This may be useful for small projects and may help to save some # trees in general. @@ -1736,9 +1841,11 @@ LATEX_EXTRA_FILES = PDF_HYPERLINKS = YES -# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate -# the PDF file directly from the LaTeX files. Set this option to YES, to get a -# higher quality PDF documentation. +# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as +# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX +# files. Set this option to YES, to get a higher quality PDF documentation. +# +# See also section LATEX_CMD_NAME for selecting the engine. # The default value is: YES. # This tag requires that the tag GENERATE_LATEX is set to YES. @@ -1772,12 +1879,28 @@ LATEX_SOURCE_CODE = NO # The LATEX_BIB_STYLE tag can be used to specify the style to use for the # bibliography, e.g. plainnat, or ieeetr. See -# http://en.wikipedia.org/wiki/BibTeX and \cite for more info. +# https://en.wikipedia.org/wiki/BibTeX and \cite for more info. # The default value is: plain. # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_BIB_STYLE = plain +# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated +# page will contain the date and time when the page was generated. Setting this +# to NO can help when comparing the output of multiple runs. +# The default value is: NO. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_TIMESTAMP = NO + +# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute) +# path from which the emoji images will be read. If a relative path is entered, +# it will be relative to the LATEX_OUTPUT directory. If left blank the +# LATEX_OUTPUT directory will be used. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_EMOJI_DIRECTORY = + #--------------------------------------------------------------------------- # Configuration options related to the RTF output #--------------------------------------------------------------------------- @@ -1817,9 +1940,9 @@ COMPACT_RTF = NO RTF_HYPERLINKS = YES -# Load stylesheet definitions from file. Syntax is similar to doxygen's config -# file, i.e. a series of assignments. You only have to provide replacements, -# missing definitions are set to their default value. +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# configuration file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. # # See also section "Doxygen usage" for information on how to generate the # default style sheet that doxygen normally uses. @@ -1828,8 +1951,8 @@ RTF_HYPERLINKS = YES RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is -# similar to doxygen's config file. A template extensions file can be generated -# using doxygen -e rtf extensionFile. +# similar to doxygen's configuration file. A template extensions file can be +# generated using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. RTF_EXTENSIONS_FILE = @@ -1915,6 +2038,13 @@ XML_OUTPUT = xml XML_PROGRAMLISTING = YES +# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include +# namespace members in file scope as well, matching the HTML output. +# The default value is: NO. +# This tag requires that the tag GENERATE_XML is set to YES. + +XML_NS_MEMB_FILE_SCOPE = NO + #--------------------------------------------------------------------------- # Configuration options related to the DOCBOOK output #--------------------------------------------------------------------------- @@ -1947,9 +2077,9 @@ DOCBOOK_PROGRAMLISTING = NO #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an -# AutoGen Definitions (see http://autogen.sf.net) file that captures the -# structure of the code including all documentation. Note that this feature is -# still experimental and incomplete at the moment. +# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures +# the structure of the code including all documentation. Note that this feature +# is still experimental and incomplete at the moment. # The default value is: NO. GENERATE_AUTOGEN_DEF = NO @@ -2116,12 +2246,6 @@ EXTERNAL_GROUPS = YES EXTERNAL_PAGES = YES -# The PERL_PATH should be the absolute path and name of the perl script -# interpreter (i.e. the result of 'which perl'). -# The default file (with absolute path) is: /usr/bin/perl. - -PERL_PATH = /sw/bin/perl - #--------------------------------------------------------------------------- # Configuration options related to the dot tool #--------------------------------------------------------------------------- @@ -2135,15 +2259,6 @@ PERL_PATH = /sw/bin/perl CLASS_DIAGRAMS = YES -# You can define message sequence charts within doxygen comments using the \msc -# command. Doxygen will then run the mscgen tool (see: -# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the -# documentation. The MSCGEN_PATH tag allows you to specify the directory where -# the mscgen tool resides. If left empty the tool is assumed to be found in the -# default search path. - -MSCGEN_PATH = - # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. @@ -2241,10 +2356,32 @@ UML_LOOK = NO # but if the number exceeds 15, the total amount of fields shown is limited to # 10. # Minimum value: 0, maximum value: 100, default value: 10. -# This tag requires that the tag HAVE_DOT is set to YES. +# This tag requires that the tag UML_LOOK is set to YES. UML_LIMIT_NUM_FIELDS = 10 +# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and +# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS +# tag is set to YES, doxygen will add type and arguments for attributes and +# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen +# will not generate fields with class member information in the UML graphs. The +# class diagrams will look similar to the default class diagrams but using UML +# notation for the relationships. +# Possible values are: NO, YES and NONE. +# The default value is: NO. +# This tag requires that the tag UML_LOOK is set to YES. + +DOT_UML_DETAILS = NO + +# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters +# to display on a single line. If the actual line length exceeds this threshold +# significantly it will wrapped across multiple lines. Some heuristics are apply +# to avoid ugly line breaks. +# Minimum value: 0, maximum value: 1000, default value: 17. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_WRAP_THRESHOLD = 17 + # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and # collaboration graphs will show the relations between templates and their # instances. @@ -2371,6 +2508,11 @@ DIAFILE_DIRS = PLANTUML_JAR_PATH = +# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a +# configuration file for plantuml. + +PLANTUML_CFG_FILE = + # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. @@ -2429,9 +2571,11 @@ DOT_MULTI_TARGETS = NO GENERATE_LEGEND = YES -# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot +# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate # files that are used to generate the various graphs. +# +# Note: This setting is not only used for dot files but also for msc and +# plantuml temporary files. # The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. DOT_CLEANUP = YES diff --git a/lapack-netlib/DOCS/DoxygenLayout.xml b/lapack-netlib/DOCS/DoxygenLayout.xml new file mode 100644 index 000000000..aeb346d8d --- /dev/null +++ b/lapack-netlib/DOCS/DoxygenLayout.xml @@ -0,0 +1,197 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lapack-netlib/DOCS/groups-usr.dox b/lapack-netlib/DOCS/groups-usr.dox index 6c31e4cf5..cbd747165 100644 --- a/lapack-netlib/DOCS/groups-usr.dox +++ b/lapack-netlib/DOCS/groups-usr.dox @@ -1,605 +1,974 @@ /** - * - **** - * - * @defgroup lapack LAPACK - * This is the group of LAPACK routines. - * - * @defgroup GE General Matrices - * @ingroup lapack - * This is the group of General Matrices routines - * @defgroup GB General Band Matrix - * @ingroup lapack - * This is the group of General Band routines - * @defgroup SY Symmetric Matrix - * @ingroup lapack - * This is the group of Symmetric Matrix routines - * @defgroup HE Hermitian Matrix - * @ingroup lapack - * This is the group of Hermitian Matrix routines - * @defgroup PO Positive Definite Matrix - * @ingroup lapack - * This is the group of Positive Definite routines - * @defgroup GT General tridiagonal Matrix - * @ingroup lapack - * This is the group of General tridiagonal routines - * @defgroup PT Positive Definite tridiagonal Matrix - * @ingroup lapack - * This is the group of Positive Definite tridiagonal routines - * @defgroup OTHEReigen Eigenvalue - * @ingroup lapack - * This is the group of Other Eigenvalue routines - * @defgroup OTHERauxiliary Other Auxiliary Routines - * @ingroup lapack - * This is the group of Other Auxiliary routines - * @defgroup OTHERcomputational Other Computational Routines - * @ingroup lapack - * This is the group of Other Computational routines - * @defgroup OTHERsolve Other Solve Routines - * @ingroup lapack - * This is the group of Other Solve routines - * - **** - * - * @defgroup solveGE Linear Solve - * @ingroup GE - * This is the group of Linear Solve Driver routines - * @defgroup solveGB Linear Solve - * @ingroup GB - * This is the group of Linear Solve Driver routines - * @defgroup solveSY Linear Solve - * @ingroup SY - * This is the group of Linear Solve Driver routines - * @defgroup solveHE Linear Solve - * @ingroup HE - * This is the group of Linear Solve Driver routines - * @defgroup solvePO Linear Solve - * @ingroup PO - * This is the group of Linear Solve Driver routines - * @defgroup solveGT Linear Solve - * @ingroup GT - * This is the group of Linear Solve Driver routines - * @defgroup solvePT Linear Solve - * @ingroup PT - * This is the group of Linear Solve Driver routines - * @defgroup eigenGE Eigenvalue - * @ingroup GE - * This is the group of Eigenvalue Driver routines - * @defgroup eigenSY Eigenvalue - * @ingroup SY - * This is the group of Eigenvalue Driver routines - * @defgroup eigenHE Eigenvalue - * @ingroup HE - * This is the group of Eigenvalue Driver routines - * @defgroup singGE Singular Value - * @ingroup GE - * This is the group of Singular Value Driver routines - * @defgroup computationalGE Computational routines - * @ingroup GE - * This is the group of Computational routines - * @defgroup variantsGEcomputational Variants Computational routines - * @ingroup GE - * This is the group of Variants Computational routines - * @defgroup computationalGB Computational routines - * @ingroup GB - * This is the group of Computational routines - * @defgroup computationalSY Computational routines - * @ingroup SY - * This is the group of Computational routines - * @defgroup computationalHE Computational routines - * @ingroup HE - * This is the group of Computational routines - * @defgroup computationalPO Computational routines - * @ingroup PO - * This is the group of Computational routines - * @defgroup variantsPOcomputational Variants Computational routines - * @ingroup PO - * This is the group of Variants Computational routines - * @defgroup computationalGT Computational routines - * @ingroup GT - * This is the group of Computational routines - * @defgroup computationalPT Computational routines - * @ingroup PT - * This is the group of Computational routines - * @defgroup variantsOTHERcomputational Variants Computational routines - * @ingroup OTHERcomputational - * This is the group of Variants Computational routines - * @defgroup auxiliaryGE Auxiliary routines - * @ingroup GE - * This is the group of Auxiliary routines - * @defgroup auxiliaryGB Auxiliary routines - * @ingroup GB - * This is the group of Auxiliary routines - * @defgroup auxiliarySY Auxiliary routines - * @ingroup SY - * This is the group of Auxiliary routines - * @defgroup auxiliaryHE Auxiliary routines - * @ingroup HE - * This is the group of Auxiliary routines - * @defgroup auxiliaryPO Auxiliary routines - * @ingroup PO - * This is the group of Auxiliary routines - * @defgroup auxiliaryGT Auxiliary routines - * @ingroup GT - * This is the group of Auxiliary routines - * @defgroup auxiliaryPT Auxiliary routines - * @ingroup PT - * This is the group of Auxiliary routines - * - **** - * - * @defgroup doubleGEsolve double - * @ingroup solveGE - * This is the group of double solve driver functions for GE matrices - * @defgroup doublePOsolve double - * @ingroup solvePO - * This is the group of double solve driver functions for PO matrices - * @defgroup doubleSYsolve double - * @ingroup solveSY - * This is the group of double solve driver functions for SY matrices - * @defgroup doubleGBsolve double - * @ingroup solveGB - * This is the group of double solve driver functions for GB matrices - * @defgroup doubleGTsolve double - * @ingroup solveGT - * This is the group of double solve driver functions for GT matrices - * @defgroup doublePTsolve double - * @ingroup solvePT - * This is the group of double solve driver functions for PT matrices - * @defgroup doubleGEeigen double - * @ingroup eigenGE - * This is the group of double eigenvalue driver functions for GE matrices - * @defgroup doubleSYeigen double - * @ingroup eigenSY - * This is the group of double eigenvalue driver functions for SY matrices - * @defgroup doubleGEsing double - * @ingroup singGE - * This is the group of double singular value driver functions for GE matrices - * @defgroup doubleGEcomputational double - * @ingroup computationalGE - * This is the group of double computational functions for GE matrices - * @defgroup doublePOcomputational double - * @ingroup computationalPO - * This is the group of double computational functions for PO matrices - * @defgroup doubleSYcomputational double - * @ingroup computationalSY - * This is the group of double computational functions for SY matrices - * @defgroup doubleGBcomputational double - * @ingroup computationalGB - * This is the group of double computational functions for GB matrices - * @defgroup doubleGTcomputational double - * @ingroup computationalGT - * This is the group of double computational functions for GT matrices - * @defgroup doublePTcomputational double - * @ingroup computationalPT - * This is the group of double computational functions for PT matrices - * @defgroup doubleGEauxiliary double - * @ingroup auxiliaryGE - * This is the group of double auxiliary functions for GE matrices - * @defgroup doublePOauxiliary double - * @ingroup auxiliaryPO - * This is the group of double auxiliary functions for PO matrices - * @defgroup doubleSYauxiliary double - * @ingroup auxiliarySY - * This is the group of double auxiliary functions for SY matrices - * @defgroup doubleGBauxiliary double - * @ingroup auxiliaryGB - * This is the group of double auxiliary functions for GB matrices - * @defgroup doublePTauxiliary double - * @ingroup auxiliaryPT - * This is the group of double auxiliary functions for PT matrices - * @defgroup doubleGTauxiliary double - * @ingroup auxiliaryGT - * This is the group of double auxiliary functions for GT matrices - * @defgroup doubleOTHERauxiliary double - * @ingroup OTHERauxiliary - * This is the group of double other auxiliary routines - * @defgroup doubleOTHERcomputational double - * @ingroup OTHERcomputational - * This is the group of double other Computational routines - * @defgroup doubleOTHERsolve double Other Solve Routines - * @ingroup OTHERsolve - * This is the group of double Other Solve routines - * @defgroup doubleOTHEReigen double - * @ingroup OTHEReigen - * This is the group of double Other Eigenvalue routines - * - **** - * - * @defgroup realGEsolve real - * @ingroup solveGE - * This is the group of real solve driver functions for GE matrices - * @defgroup realPOsolve real - * @ingroup solvePO - * This is the group of real solve driver functions for PO matrices - * @defgroup realSYsolve real - * @ingroup solveSY - * This is the group of real solve driver functions for SY matrices - * @defgroup realGBsolve real - * @ingroup solveGB - * This is the group of real solve driver functions for GB matrices - * @defgroup realGTsolve real - * @ingroup solveGT - * This is the group of real solve driver functions for GT matrices - * @defgroup realPTsolve real - * @ingroup solvePT - * This is the group of real solve driver functions for PT matrices - * @defgroup realGEeigen real - * @ingroup eigenGE - * This is the group of real eigenvalue driver functions for GE matrices - * @defgroup realSYeigen real - * @ingroup eigenSY - * This is the group of real eigenvalue driver functions for SY matrices - * @defgroup realGEsing real - * @ingroup singGE - * This is the group of real singular value driver functions for GE matrices - * @defgroup realGEcomputational real - * @ingroup computationalGE - * This is the group of real computational functions for GE matrices - * @defgroup realPOcomputational real - * @ingroup computationalPO - * This is the group of real computational functions for PO matrices - * @defgroup realSYcomputational real - * @ingroup computationalSY - * This is the group of real computational functions for SY matrices - * @defgroup realGBcomputational real - * @ingroup computationalGB - * This is the group of real computational functions for GB matrices - * @defgroup realPTcomputational real - * @ingroup computationalPT - * This is the group of real computational functions for PT matrices - * @defgroup realGTcomputational real - * @ingroup computationalGT - * This is the group of real computational functions for GT matrices - * @defgroup realGEauxiliary real - * @ingroup auxiliaryGE - * This is the group of real auxiliary functions for GE matrices - * @defgroup realPOauxiliary real - * @ingroup auxiliaryPO - * This is the group of real auxiliary functions for PO matrices - * @defgroup realSYauxiliary real - * @ingroup auxiliarySY - * This is the group of real auxiliary functions for SY matrices - * @defgroup realGBauxiliary real - * @ingroup auxiliaryGB - * This is the group of real auxiliary functions for GB matrices - * @defgroup realGTauxiliary real - * @ingroup auxiliaryGT - * This is the group of real auxiliary functions for GT matrices - * @defgroup realPTauxiliary real - * @ingroup auxiliaryPT - * This is the group of real auxiliary functions for PT matrices - * @defgroup realOTHERauxiliary real - * @ingroup OTHERauxiliary - * This is the group of real other auxiliary routines - * @defgroup realOTHERcomputational real - * @ingroup OTHERcomputational - * This is the group of real other Computational routines - * @defgroup realOTHERsolve real Other Solve Routines - * @ingroup OTHERsolve - * This is the group of real Other Solve routines - * @defgroup realOTHEReigen real - * @ingroup OTHEReigen - * This is the group of real Other Eigenvalue routines - * - **** - * - * @defgroup complexGEsolve complex - * @ingroup solveGE - * This is the group of complex solve driver functions for GE matrices - * @defgroup complexPOsolve complex - * @ingroup solvePO - * This is the group of complex solve driver functions for PO matrices - * @defgroup complexSYsolve complex - * @ingroup solveSY - * This is the group of complex solve driver functions for SY matrices - * @defgroup complexHEsolve complex - * @ingroup solveHE - * This is the group of complex solve driver functions for HE matrices - * @defgroup complexGBsolve complex - * @ingroup solveGB - * This is the group of complex solve driver functions for GB matrices - * @defgroup complexGTsolve complex - * @ingroup solveGT - * This is the group of complex solve driver functions for GT matrices - * @defgroup complexPTsolve complex - * @ingroup solvePT - * This is the group of complex solve driver functions for PT matrices - * @defgroup complexGEeigen complex - * @ingroup eigenGE - * This is the group of complex eigenvalue driver functions for GE matrices - * @defgroup complexSYeigen complex - * @ingroup eigenSY - * This is the group of complex eigenvalue driver functions for SY matrices - * @defgroup complexHEeigen complex - * @ingroup eigenHE - * This is the group of complex eigenvalue driver functions for HE matrices - * @defgroup complexGEsing complex - * @ingroup singGE - * This is the group of complex singular value driver functions for GE matrices - * @defgroup complexGEcomputational complex - * @ingroup computationalGE - * This is the group of complex computational functions for GE matrices - * @defgroup complexPOcomputational complex - * @ingroup computationalPO - * This is the group of complex computational functions for PO matrices - * @defgroup complexSYcomputational complex - * @ingroup computationalSY - * This is the group of complex computational functions for SY matrices - * @defgroup complexHEcomputational complex - * @ingroup computationalHE - * This is the group of complex computational functions for HE matrices - * @defgroup complexGBcomputational complex - * @ingroup computationalGB - * This is the group of complex computational functions for GB matrices - * @defgroup complexGTcomputational complex - * @ingroup computationalGT - * This is the group of complex computational functions for GT matrices - * @defgroup complexPTcomputational complex - * @ingroup computationalPT - * This is the group of complex computational functions for PT matrices - * @defgroup complexGEauxiliary complex - * @ingroup auxiliaryGE - * This is the group of complex auxiliary functions for GE matrices - * @defgroup complexPOauxiliary complex - * @ingroup auxiliaryPO - * This is the group of complex auxiliary functions for PO matrices - * @defgroup complexSYauxiliary complex - * @ingroup auxiliarySY - * This is the group of complex auxiliary functions for SY matrices - * @defgroup complexHEauxiliary complex - * @ingroup auxiliaryHE - * This is the group of complex auxiliary functions for HE matrices - * @defgroup complexGBauxiliary complex - * @ingroup auxiliaryGB - * This is the group of complex auxiliary functions for GB matrices - * @defgroup complexOTHERauxiliary complex - * @ingroup OTHERauxiliary - * This is the group of complex other auxiliary routines - * @defgroup complexOTHERcomputational complex - * @ingroup OTHERcomputational - * This is the group of complex other Computational routines - * @defgroup complexOTHERsolve complex Other Solve Routines - * @ingroup OTHERsolve - * This is the group of complex Other Solve routines - * @defgroup complexOTHEReigen complex Other Eigenvalue routines - * @ingroup OTHEReigen - * This is the group of complex Other Eigenvalue routines - * - **** - * - * @defgroup complex16GEsolve complex16 - * @ingroup solveGE - * This is the group of complex16 solve driver functions for GE matrices - * @defgroup complex16POsolve complex16 - * @ingroup solvePO - * This is the group of complex16 solve driver functions for PO matrices - * @defgroup complex16SYsolve complex16 - * @ingroup solveSY - * This is the group of complex16 solve driver functions for SY matrices - * @defgroup complex16HEsolve complex16 - * @ingroup solveHE - * This is the group of complex16 solve driver functions for HE matrices - * @defgroup complex16GBsolve complex16 - * @ingroup solveGB - * This is the group of complex16 solve driver functions for GB matrices - * @defgroup complex16GTsolve complex16 - * @ingroup solveGT - * This is the group of complex16 solve driver functions for GT matrices - * @defgroup complex16PTsolve complex16 - * @ingroup solvePT - * This is the group of complex16 solve driver functions for PT matrices - * @defgroup complex16GEeigen complex16 - * @ingroup eigenGE - * This is the group of complex16 eigenvalue driver functions for GE matrices - * @defgroup complex16SYeigen complex16 - * @ingroup eigenSY - * This is the group of complex16 eigenvalue driver functions for SY matrices - * @defgroup complex16HEeigen complex16 - * @ingroup eigenHE - * This is the group of complex16 eigenvalue driver functions for HE matrices - * @defgroup complex16GEsing complex16 - * @ingroup singGE - * This is the group of complex16 singular value driver functions for GE matrices - * @defgroup complex16GEcomputational complex16 - * @ingroup computationalGE - * This is the group of complex16 computational functions for GE matrices - * @defgroup complex16POcomputational complex16 - * @ingroup computationalPO - * This is the group of complex16 computational functions for PO matrices - * @defgroup complex16SYcomputational complex16 - * @ingroup computationalSY - * This is the group of complex16 computational functions for SY matrices - * @defgroup complex16HEcomputational complex16 - * @ingroup computationalHE - * This is the group of complex16 computational functions for HE matrices - * @defgroup complex16GBcomputational complex16 - * @ingroup computationalGB - * This is the group of complex16 computational functions for GB matrices - * @defgroup complex16GTcomputational complex16 - * @ingroup computationalGT - * This is the group of complex16 computational functions for GT matrices - * @defgroup complex16PTcomputational complex16 - * @ingroup computationalPT - * This is the group of complex16 computational functions for PT matrices - * @defgroup complex16GEauxiliary complex16 - * @ingroup auxiliaryGE - * This is the group of complex16 auxiliary functions for GE matrices - * @defgroup complex16POauxiliary complex16 - * @ingroup auxiliaryPO - * This is the group of complex16 auxiliary functions for PO matrices - * @defgroup complex16SYauxiliary complex16 - * @ingroup auxiliarySY - * This is the group of complex16 auxiliary functions for SY matrices - * @defgroup complex16HEauxiliary complex16 - * @ingroup auxiliaryHE - * This is the group of complex16 auxiliary functions for HE matrices - * @defgroup complex16GBauxiliary complex16 - * @ingroup auxiliaryGB - * This is the group of complex16 auxiliary functions for GB matrices - * @defgroup complex16OTHERcomputational complex16 - * @ingroup OTHERcomputational - * This is the group of complex16 other Computational routines - * @defgroup complex16OTHERauxiliary complex16 - * @ingroup OTHERauxiliary - * This is the group of complex16 other auxiliary routines - * @defgroup auxOTHERcomputational auxiliary Computational routines - * @ingroup OTHERcomputational - * This is the group of auxiliary Computational routines - * @defgroup complex16OTHERsolve complex16 Other Solve Routines - * @ingroup OTHERsolve - * This is the group of complex16 Other Solve routines - * @defgroup complex16OTHEReigen complex16 Other Eigenvalue routines - * @ingroup OTHEReigen - * This is the group of complex16 Other Eigenvalue routines - * - **** - * - * @defgroup testing LAPACK Testing - * This is the group of LAPACK TESTING routines. - * - * @defgroup matgen Matrix Generation - * @ingroup testing - * This is the group of LAPACK TESTING MATGEN routines. - * - * @defgroup lin Linear Solve - * @ingroup testing - * This is the group of LAPACK TESTING LIN routines. - * - * @defgroup eig Eigenvalue and Singular value - * @ingroup testing - * This is the group of LAPACK TESTING EIG routines. - * - * @defgroup real_matgen real - * @ingroup matgen - * This is the group of real LAPACK TESTING MATGEN routines. - * - * @defgroup double_matgen double - * @ingroup matgen - * This is the group of double LAPACK TESTING MATGEN routines. - * - * @defgroup complex_matgen complex - * @ingroup matgen - * This is the group of complex LAPACK TESTING MATGEN routines. - * - * @defgroup complex16_matgen complex16 - * @ingroup matgen - * This is the group of complex16 LAPACK TESTING MATGEN routines. - * - * @defgroup aux_matgen aux - * @ingroup matgen - * This is the group of auxiliary LAPACK TESTING MATGEN routines. - * - * @defgroup single_lin real - * @ingroup lin - * This is the group of real LAPACK TESTING LIN routines. - * - * @defgroup double_lin double - * @ingroup lin - * This is the group of double LAPACK TESTING LIN routines. - * - * @defgroup complex_lin complex - * @ingroup lin - * This is the group of complex LAPACK TESTING LIN routines. - * - * @defgroup complex16_lin complex16 - * @ingroup lin - * This is the group of complex16 LAPACK TESTING LIN routines. - * - * @defgroup aux_lin aux - * @ingroup lin - * This is the group of auxiliary LAPACK TESTING LIN routines. - * - * @defgroup single_eig real - * @ingroup eig - * This is the group of real LAPACK TESTING EIG routines. - * - * @defgroup double_eig double - * @ingroup eig - * This is the group of double LAPACK TESTING EIG routines. - * - * @defgroup complex_eig complex - * @ingroup eig - * This is the group of complex LAPACK TESTING EIG routines. - * - * @defgroup complex16_eig complex16 - * @ingroup eig - * This is the group of complex16 LAPACK TESTING EIG routines. - * - * @defgroup aux_eig aux - * @ingroup eig - * This is the group of auxiliary LAPACK TESTING EIG routines. - * - **** - * @defgroup blas Reference BLAS - * This is the group of BLAS routines. - * - * @defgroup level1 Level1 - * @ingroup blas - * This is the group of LEVEL 1 BLAS routines. - * @defgroup level2 Level2 - * @ingroup blas - * This is the group of LEVEL 2 BLAS routines. - * @defgroup level3 Level3 - * @ingroup blas - * This is the group of LEVEL 3 BLAS routines. - * @defgroup aux_blas Auxiliary BLAS - * @ingroup blas - * This is the group of Auxiliary 3 BLAS routines. -* @defgroup blastesting Testing - * @ingroup blas - * This is the group of BLAS TESTING routines. - * - * @defgroup single_blas_level1 real - * @ingroup level1 - * This is the group of real LEVEL 1 BLAS routines. - * @defgroup double_blas_level1 double - * @ingroup level1 - * This is the group of double LEVEL 1 BLAS routines. - * @defgroup complex_blas_level1 complex - * @ingroup level1 - * This is the group of complex LEVEL 1 BLAS routines. - * @defgroup complex16_blas_level1 complex16 - * @ingroup level1 - * This is the group of complex16 LEVEL 1 BLAS routines. - * - * @defgroup single_blas_level2 real - * @ingroup level2 - * This is the group of real LEVEL 2 BLAS routines. - * @defgroup double_blas_level2 double - * @ingroup level2 - * This is the group of double LEVEL 2 BLAS routines. - * @defgroup complex_blas_level2 complex - * @ingroup level2 - * This is the group of complex LEVEL 2 BLAS routines. - * @defgroup complex16_blas_level2 complex16 - * @ingroup level2 - * This is the group of complex16 LEVEL 2 BLAS routines. - * - * @defgroup single_blas_level3 real - * @ingroup level3 - * This is the group of real LEVEL 3 BLAS routines. - * @defgroup double_blas_level3 double - * @ingroup level3 - * This is the group of double LEVEL 3 BLAS routines. - * @defgroup complex_blas_level3 complex - * @ingroup level3 - * This is the group of complex LEVEL 3 BLAS routines. - * @defgroup complex16_blas_level3 complex16 - * @ingroup level3 - * This is the group of complex16 LEVEL 3 BLAS routines. - * - * @defgroup single_blas_testing real - * @ingroup blastesting - * This is the group of real BLAS TESTING routines. - * @defgroup double_blas_testing double - * @ingroup blastesting - * This is the group of double BLAS TESTING routines. - * @defgroup complex_blas_testing complex - * @ingroup blastesting - * This is the group of complex BLAS TESTING routines. - * @defgroup complex16_blas_testing complex16 - * @ingroup blastesting - * This is the group of complex16 BLAS TESTING routines. - * + +@defgroup lapack_top LAPACK +@{ + @defgroup solve_top Linear solve, AX = B + @{ + @defgroup gesv_driver_grp LU: General matrix, driver + @{ + @defgroup gesv_driver --- full --- + @defgroup gesv gesv: factor and solve + @defgroup gesvx gesvx: factor and solve, expert + @defgroup gesvxx gesvxx: factor and solve, extra precise + @defgroup gesv_mixed gesv: factor and solve, mixed precision + + @defgroup gbsv_driver --- banded --- + @defgroup gbsv gbsv: factor and solve + @defgroup gbsvx gbsvx: factor and solve, expert + @defgroup gbsvxx gbsvxx: factor and solve, extra precise + + @defgroup gtsv_driver --- tridiagonal --- + @defgroup gtsv gtsv: factor and solve + @defgroup gtsvx gtsvx: factor and solve, expert + @} + + @defgroup gesv_comp_grp LU: computational routines (factor, cond, etc.) + @{ + @defgroup gesv_comp --- full --- + @defgroup gecon gecon: condition number estimate + @defgroup getrf getrf: triangular factor + @defgroup getrf2 getrf2: triangular factor panel, recursive? + @defgroup getf2 getf2: triangular factor panel, level 2 + @defgroup getrs getrs: triangular solve using factor + @defgroup getri getri: triangular inverse + @defgroup gerfs gerfs: iterative refinement + @defgroup gerfsx gerfsx: iterative refinement, expert + @defgroup geequ geequ: equilibration + @defgroup geequb geequb: equilibration, power of 2 + @defgroup laqge laqge: row/col scale matrix + @defgroup laswp laswp: swap permutation + + @defgroup getc2 getc2: triangular factor, with complete pivoting + @defgroup gesc2 gesc2: triangular solve using factor, with complete pivoting + @defgroup latdf latdf: Dif-estimate with complete pivoting LU, step in tgsen + + @defgroup la_gercond la_gercond: Skeel condition number estimate + @defgroup la_gerpvgrw la_gerpvgrw: reciprocal pivot growth + @defgroup la_gerfsx_extended la_gerfsx_extended: step in gerfsx + + @defgroup gbsv_comp --- banded --- + @defgroup gbcon gbcon: condition number estimate + @defgroup gbtrf gbtrf: triangular factor + @defgroup gbtf2 gbtf2: triangular factor, level 2 + @defgroup gbtrs gbtrs: triangular solve using factor + @defgroup gbrfs gbrfs: iterative refinement + @defgroup gbrfsx gbrfsx: iterative refinement, expert + @defgroup gbequ gbequ: equilibration + @defgroup gbequb gbequb: equilibration, power of 2 + @defgroup laqgb laqgb: row/col scale matrix + @defgroup la_gbrcond la_gbrcond: Skeel condition number estimate + @defgroup la_gbrpvgrw la_gbrpvgrw: reciprocal pivot growth + @defgroup la_gbrfsx_extended la_gbrfsx_extended: step in gbrfsx + + @defgroup gtsv_comp --- tridiagonal --- + @defgroup gtcon gtcon: condition number estimate + @defgroup gttrf gttrf: triangular factor + @defgroup gttrs gttrs: triangular solve using factor + @defgroup gtts2 gtts2: triangular solve using factor + @defgroup gtrfs gtrfs: iterative refinement + @} + + @defgroup posv_driver_grp Cholesky: Hermitian/symmetric positive definite matrix, driver + @{ + @defgroup posv_driver --- full --- + @defgroup posv posv: factor and solve + @defgroup posvx posvx: factor and solve, expert + @defgroup posvxx posvxx: factor and solve, extra precise + @defgroup posv_mixed posv: factor and solve, mixed precision + + @defgroup ppsv_driver --- packed --- + @defgroup ppsv ppsv: factor and solve + @defgroup ppsvx ppsvx: factor and solve, expert + + @defgroup pfsv_driver --- rectangular full packed (RFP) --- + @defgroup pfsv pfsv: factor and solve [not available] + + @defgroup pbsv_driver --- banded --- + @defgroup pbsv pbsv: factor and solve + @defgroup pbsvx pbsvx: factor and solve, expert + + @defgroup ptsv_driver --- tridiagonal --- + @defgroup ptsv ptsv: factor and solve + @defgroup ptsvx ptsvx: factor and solve, expert + @} + + @defgroup posv_comp_grp Cholesky: computational routines (factor, cond, etc.) + @{ + @defgroup posv_comp --- full --- + @defgroup pocon pocon: condition number estimate + @defgroup potrf potrf: triangular factor + @defgroup potrf2 potrf2: triangular factor panel, recursive? + @defgroup potf2 potf2: triangular factor panel, level 2 + @defgroup pstrf pstrf: triangular factor, with pivoting + @defgroup pstf2 pstf2: triangular factor, with pivoting panel, level 2 + @defgroup potrs potrs: triangular solve using factor + @defgroup potri potri: triangular inverse + @defgroup porfs porfs: iterative refinement + @defgroup porfsx porfsx: iterative refinement, expert + @defgroup poequ poequ: equilibration + @defgroup poequb poequb: equilibration, power of 2 + @defgroup laqhe laqhe: row/col scale matrix + @defgroup la_porcond la_porcond: Skeel condition number estimate + @defgroup la_porpvgrw la_porpvgrw: reciprocal pivot growth + @defgroup la_porfsx_extended la_porfsx_extended: step in porfsx + + @defgroup ppsv_comp --- packed --- + @defgroup ppcon ppcon: condition number estimate + @defgroup pptrf pptrf: triangular factor + @defgroup pptrs pptrs: triangular solve using factor + @defgroup pptri pptri: triangular inverse + @defgroup pprfs pprfs: iterative refinement + @defgroup ppequ ppequ: equilibration + @defgroup laqhp laqhp: row/col scale matrix + + @defgroup pfsv_comp --- rectangular full packed (RFP) --- + @defgroup pftrf pftrf: triangular factor + @defgroup pftrs pftrs: triangular solve using factor + @defgroup pftri pftri: triangular inverse + + @defgroup pbsv_comp --- banded --- + @defgroup pbcon pbcon: condition number estimate + @defgroup pbtrf pbtrf: triangular factor + @defgroup pbtf2 pbtf2: triangular factor panel, level 2 + @defgroup pbtrs pbtrs: triangular solve using factor + @defgroup pbrfs pbrfs: iterative refinement + @defgroup pbequ pbequ: equilibration + @defgroup laqhb laqhb: row/col scale matrix + + @defgroup ptsv_comp --- tridiagonal --- + @defgroup ptcon ptcon: condition number estimate + @defgroup pttrf pttrf: triangular factor + @defgroup pttrs pttrs: triangular solve using factor + @defgroup ptts2 ptts2: triangular solve using factor, unblocked + @defgroup ptrfs ptrfs: iterative refinement + @} + + @defgroup hesv_driver_grp LDL: Hermitian/symmetric indefinite matrix, driver + @{ + @defgroup hesv_driver --- full, rook pivoting --- + @defgroup hesv {he,sy}sv: rook (v1) + @defgroup hesv_rook {he,sy}sv_rook: rook (v2) + @defgroup hesv_rk {he,sy}sv_rk: rook (v3) + @defgroup hesvx {he,sy}svx: rook (v1, expert) + @defgroup hesvxx {he,sy}svxx: rook (v1, expert) + + @defgroup hpsv_driver --- packed, rook pivoting --- + @defgroup hpsv {hp,sp}sv: factor and solve + @defgroup hpsvx {hp,sp}svx: factor and solve, expert + + @defgroup hesv_aa_driver --- full, Aasen --- + @defgroup hesv_aa {he,sy}sv_aa: Aasen + @defgroup hesv_aa_2stage {he,sy}sv_aa_2stage: Aasen, blocked 2-stage + @} + + @defgroup hesv_comp_grp LDL: computational routines (factor, cond, etc.) + @{ + @defgroup hesv_comp_v1 --- full, rook v1 --- + @defgroup hecon {he,sy}con: condition number estimate + @defgroup hetrf {he,sy}trf: triangular factor + @defgroup lahef la{he,sy}f: step in hetrf + @defgroup hetf2 {he,sy}tf2: triangular factor, level 2 + @defgroup hetrs {he,sy}trs: triangular solve using factor + @defgroup hetri {he,sy}tri: triangular inverse + @defgroup herfs {he,sy}rfs: iterative refinement + @defgroup herfsx {he,sy}rfsx: iterative refinement, expert + @defgroup heequb {he,sy}equb: equilibration, power of 2 + @defgroup syconv syconv: convert to/from L and D from hetrf + + @defgroup hecon_3 {he,sy}con_3: condition number estimate + @defgroup hetri2 {he,sy}tri2: inverse + @defgroup hetri2x {he,sy}tri2x: inverse + @defgroup hetri_3 {he,sy}tri_3: inverse + @defgroup hetri_3x {he,sy}tri_3x: inverse + @defgroup hetrs2 {he,sy}trs2: solve using factor + @defgroup hetrs_3 {he,sy}trs_3: solve using factor + + @defgroup heswapr {he,sy}swapr: apply 2-sided permutation + @defgroup la_hercond la_hercond: Skeel condition number estimate + @defgroup la_herfsx_extended la_herfsx_extended: step in herfsx + @defgroup la_herpvgrw la_herpvgrw: reciprocal pivot growth + + @defgroup hpsv_comp --- packed, rook v1 --- + @defgroup hpcon {hp,sp}con: condition number estimate + @defgroup hptrf {hp,sp}trf: triangular factor + @defgroup hptrs {hp,sp}trs: triangular solve using factor + @defgroup hptri {hp,sp}tri: triangular inverse + @defgroup hprfs {hp,sp}rfs: iterative refinement + + @defgroup hesv_comp_v2 --- full, rook v2 --- + @defgroup hecon_rook {he,sy}con_rook: condition number estimate + @defgroup hetrf_rook {he,sy}trf_rook: triangular factor + @defgroup lahef_rook la{he,sy}f_rook: triangular factor step + @defgroup hetf2_rook {he,sy}tf2_rook: triangular factor, level 2 + @defgroup hetrs_rook {he,sy}trs_rook: triangular solve using factor + @defgroup hetri_rook {he,sy}tri_rook: triangular inverse + + @defgroup hesv_comp_v3 --- full, rook v3 --- + @defgroup hetrf_rk {he,sy}trf_rk: triangular factor + @defgroup lahef_rk la{he,sy}f_rk: triangular factor step + @defgroup hetf2_rk {he,sy}tf2_rk: triangular factor, level 2 + @defgroup syconvf syconvf: convert to/from hetrf to hetrf_rk format + @defgroup syconvf_rook syconvf_rook: convert to/from hetrf_rook to hetrf_rk format + + @defgroup hesv_comp_aasen --- full, Aasen --- + @defgroup hetrf_aa {he,sy}trf_aa: triangular factor + @defgroup lahef_aa la{he,sy}f_aa: triangular factor partial factor + @defgroup hetrs_aa {he,sy}trs_aa: triangular solve using factor + + @defgroup hesv_comp_aasen2 --- full, Aasen, blocked 2-stage --- + @defgroup hetrf_aa_2stage {he,sy}trf_aa_2stage: triangular factor + @defgroup hetrs_aa_2stage {he,sy}trs_aa_2stage: triangular solve using factor + @} + + @defgroup trsv_comp_grp Triangular computational routines (solve, cond, etc.) + @{ + @defgroup trsv_comp --- full --- + @defgroup trcon trcon: condition number estimate + @defgroup trtrs trtrs: triangular solve + @defgroup latrs latrs: triangular solve with robust scaling + @defgroup latrs3 latrs3: triangular solve with robust scaling, level 3 + @defgroup trtri trtri: triangular inverse + @defgroup trti2 trti2: triangular inverse, level 2 + @defgroup trrfs trrfs: triangular iterative refinement + @defgroup lauum lauum: triangular multiply: U^H U + @defgroup lauu2 lauu2: triangular multiply: U^H U, level 2 + + @defgroup tpsv_comp --- packed --- + @defgroup tpcon tpcon: condition number estimate + @defgroup tptrs tptrs: triangular solve + @defgroup latps latps: triangular solve with robust scaling + @defgroup tptri tptri: triangular inverse + @defgroup tprfs tprfs: triangular iterative refinement + + @defgroup tfsv_comp --- rectangular full packed (RFP) --- + @defgroup tftri tftri: triangular inverse, RFP + + @defgroup tbsv_comp --- banded --- + @defgroup tbcon tbcon: condition number estimate + @defgroup tbtrs tbtrs: triangular solve + @defgroup latbs latbs: triangular solve with scaling + @defgroup tbrfs tbrfs: triangular iterative refinement + @} + + @defgroup solve_aux_grp Auxiliary routines + @{ + @defgroup lacn2 lacn2: 1-norm estimate, e.g., || A^{-1} ||_1 in gecon + @defgroup lacon lacon: 1-norm estimate, e.g., || A^{-1} ||_1 in gecon, old + @defgroup la_lin_berr la_lin_berr: backward error + @} + @} + + @defgroup gels_top Least squares + @{ + @defgroup gels_driver_grp Standard least squares, min || Ax - b ||_2 + @{ + @defgroup gels gels: least squares using QR/LQ + @defgroup gelst gelst: least squares using QR/LQ with T matrix + @defgroup gelss gelss: least squares using SVD, QR iteration + @defgroup gelsd gelsd: least squares using SVD, divide and conquer + @defgroup gelsy gelsy: least squares using complete orthogonal factor + @defgroup getsls getsls: least squares using tall-skinny QR/LQ + @} + + @defgroup ggls_driver_grp Constrained least squares + @{ + @defgroup gglse gglse: equality-constrained least squares + @defgroup ggglm ggglm: Gauss-Markov linear model + @} + + @defgroup gels_aux_grp Auxiliary routines + @{ + @defgroup laic1 laic1: condition estimate, step in gelsy + @defgroup lals0 lals0: back multiplying factors, step in gelsd + @defgroup lalsa lalsa: SVD of coefficient matrix, step in gelsd + @defgroup lalsd lalsd: uses SVD for least squares, step in gelsd + @} + @} + + @defgroup unitary_top Orthogonal/unitary factors (QR, CS, etc.) + @{ + @defgroup geqr_comp_grp QR + @{ + @defgroup geqr_comp1 --- flexible --- + @defgroup geqr geqr: QR factor, flexible + @defgroup gemqr gemqr: multiply by Q from geqr + + @defgroup geqr_comp2 --- classic --- + @defgroup geqrf geqrf: QR factor + @defgroup geqr2 geqr2: QR factor, level 2 + @defgroup ungqr {un,or}gqr: generate explicit Q from geqrf + @defgroup ung2r {un,or}g2r: generate explicit Q from geqrf, level 2 + @defgroup unmqr {un,or}mqr: multiply by Q from geqrf + @defgroup unm2r {un,or}m2r: multiply by Q from geqrf, level 2 + + @defgroup geqr_comp3 --- with T --- + @defgroup geqrt geqrt: QR factor, with T + @defgroup geqrt2 geqrt2: QR factor, with T, level 2 + @defgroup geqrt3 geqrt3: QR factor, with T, recursive panel + @defgroup gemqrt gemqrt: multiply by Q from geqrt + + @defgroup geqr_comp4 --- positive --- + @defgroup geqrfp geqrfp: QR factor, diag( R ) ≥ 0 + @defgroup geqr2p geqr2p: QR factor, diag( R ) ≥ 0, level 2 + @} + + @defgroup geqpf_comp_grp QR with pivoting + @{ + @defgroup geqp3 geqp3: QR factor with pivoting, level 3 + @defgroup laqp2 laqp2: step of geqp3 + @defgroup laqps laqps: step of geqp3 + @} + + @defgroup getsqr_comp_grp QR, tall-skinny + @{ + @defgroup latsqr latsqr: tall-skinny QR factor + @defgroup ungtsqr {un,or}gtsqr: generate Q from latsqr + @defgroup ungtsqr_row {un,or}gtsqr_row: generate Q from latsqr + @defgroup larfb_gett larfb_gett: step in ungtsqr_row + @defgroup lamtsqr lamtsqr: multiply by Q from latsqr + @defgroup getsqrhrt getsqrhrt: tall-skinny QR factor, with Householder reconstruction + @defgroup unhr_col {un,or}hr_col: Householder reconstruction + @defgroup launhr_col_getrfnp la{un,or}hr_col_getrfnp: LU factor without pivoting + @defgroup launhr_col_getrfnp2 la{un,or}hr_col_getrfnp2: LU factor without pivoting, level 2 + @} + + @defgroup tpqr_comp_grp QR, triangular-pentagonal + @{ + @defgroup tpqrt tpqrt: QR factor + @defgroup tpqrt2 tpqrt2: QR factor, level 2 + @defgroup tpmqrt tpmqrt: applies Q + @defgroup tprfb tprfb: applies Q (like larfb) + @} + + @defgroup ggqr_comp_grp Generalized QR + @{ + @defgroup ggqrf ggqrf: Generalized QR factor + @} + + @defgroup gelq_comp_grp LQ + @{ + @defgroup gelq_comp1 --- flexible --- + @defgroup gelq gelq: LQ factor, flexible + @defgroup gemlq gemlq: multiply by Q from gelq + + @defgroup gelq_comp2 --- classic --- + @defgroup gelqf gelqf: LQ factor + @defgroup gelq2 gelq2: LQ factor, level 2 + @defgroup unglq {un,or}glq: generate explicit Q from gelqf + @defgroup ungl2 {un,or}gl2: generate explicit Q, level 2, step in unglq + @defgroup unmlq {un,or}mlq: multiply by Q from gelqf + @defgroup unml2 {un,or}ml2: multiply by Q, level 2, step in unmlq + + @defgroup gelq_comp3 --- with T --- + @defgroup gelqt gelqt: LQ factor, with T + @defgroup gelqt3 gelqt3: LQ factor, with T, recursive + @defgroup gemlqt gemlqt: multiply by Q from gelqt + @} + + @defgroup geswlq_comp_grp LQ, short-wide + @{ + @defgroup laswlq laswlq: short-wide LQ factor + @defgroup lamswlq lamswlq: multiply by Q from laswlq + @} + + @defgroup tplq_comp_grp LQ, triangular-pentagonal + @{ + @defgroup tplqt tplqt: QR factor + @defgroup tplqt2 tplqt2: QR factor, level 2 + @defgroup tpmlqt tpmlqt: applies Q + @} + + @defgroup geql_comp_grp QL + @{ + @defgroup geqlf geqlf: QL factor + @defgroup geql2 geql2: QL factor, level 2 + @defgroup ungql {un,or}gql: generate explicit Q from geqlf + @defgroup unmql {un,or}mql: multiply by Q from geqlf + @defgroup ung2l {un,or}g2l: step in ungql + @defgroup unm2l {un,or}m2l: step in unmql + @} + + @defgroup gerq_comp_grp RQ + @{ + @defgroup gerqf gerqf: RQ factor + @defgroup gerq2 gerq2: RQ factor, level 2 + @defgroup ungrq {un,or}grq: generate explicit Q from gerqf + @defgroup unmrq {un,or}mrq: multiply by Q from gerqf + @defgroup unmr2 {un,or}mr2: step in unmrq + @defgroup ungr2 {un,or}gr2: step in ungrq + @} + + @defgroup ggrq_comp_grp Generalized RQ + @{ + @defgroup ggrqf ggrqf: Generalized RQ factor + @} + + @defgroup gerz_comp_grp RZ + @{ + @defgroup tzrzf tzrzf: RZ factor + @defgroup latrz latrz: RZ factor step + @defgroup unmrz {un,or}mrz: multiply by Z from tzrzf + @defgroup unmr3 {un,or}mr3: step in unmrz + @defgroup larz larz: apply reflector + @defgroup larzb larzb: apply block reflector + @defgroup larzt larzt: generate T matrix + @} + + @defgroup gecs_comp_grp Cosine-Sine (CS) decomposition + @{ + @defgroup bbcsd bbcsd: ?? + @defgroup uncsd {un,or}csd: ?? + @defgroup uncsd2by1 {un,or}csd2by1: ?? + @defgroup unbdb {un,or}bdb: bidiagonalize partitioned unitary matrix, step in uncsd + @defgroup unbdb1 {un,or}bdb1: step in uncsd2by1 + @defgroup unbdb2 {un,or}bdb2: step in uncsd2by1 + @defgroup unbdb3 {un,or}bdb3: step in uncsd2by1 + @defgroup unbdb4 {un,or}bdb4: step in uncsd2by1 + @defgroup unbdb5 {un,or}bdb5: step in uncsd2by1 + @defgroup unbdb6 {un,or}bdb6: step in uncsd2by1 + + @defgroup lapmr lapmr: permute rows + @defgroup lapmt lapmt: permute cols + @} + + @defgroup reflector_aux_grp Householder reflectors + @{ + @defgroup larf larf: apply Householder reflector + @defgroup larfx larfx: apply Householder reflector, unrolled + @defgroup larfy larfy: apply Householder reflector symmetrically (2-sided) + @defgroup larfb larfb: apply block Householder reflector + @defgroup larfg larfg: generate Householder reflector + @defgroup larfgp larfgp: generate Householder reflector, beta ≥ 0 + @defgroup larft larft: generate T matrix + @} + + @defgroup rot_aux_grp Givens/Jacobi plane rotations + @{ + @defgroup lartg lartg: generate plane rotation, more accurate than BLAS rot + @defgroup lartgp lartgp: generate plane rotation, more accurate than BLAS rot + @defgroup lasr lasr: apply series of plane rotations + @defgroup largv largv: generate vector of plane rotations + @defgroup lartv lartv: apply vector of plane rotations to vectors + @defgroup lar2v lar2v: apply vector of plane rotations to 2x2 matrices + @defgroup lacrt lacrt: apply plane rotation (unused?) + @} + @} + + @defgroup geev_top Non-symmetric eigenvalues + @{ + @defgroup geev_driver_grp Standard eig driver, AV = VΛ + @{ + @defgroup geev geev: eig + @defgroup geevx geevx: eig, expert + + @defgroup gees gees: Schur form + @defgroup geesx geesx: Schur form, expert + @} + + @defgroup ggev_driver_grp Generalized eig driver + @{ + @defgroup ggev3 ggev3: eig + @defgroup ggev ggev: eig, unblocked + @defgroup ggevx ggevx: eig, expert + + @defgroup gges3 gges3: Schur form + @defgroup gges gges: Schur form, unblocked + @defgroup ggesx ggesx: Schur form, expert + @} + + @defgroup gedmd DMD driver, Dynamic Mode Decomposition + + @defgroup geev_comp_grp Eig computational routines + @{ + @defgroup gebal gebal: balance matrix + @defgroup gehrd gehrd: reduction to Hessenberg + @defgroup gehd2 gehd2: reduction to Hessenberg, level 2 + @defgroup lahr2 lahr2: step in gehrd + @defgroup unghr {un,or}ghr: generate Q from gehrd + @defgroup unmhr {un,or}mhr: multiply by Q from gehrd + @defgroup gebak gebak: back-transform eigvec + @defgroup hseqr hseqr: Hessenberg eig, QR iteration + @defgroup hsein hsein: Hessenberg inverse iteration for eigvec + @defgroup trevc trevc: eigenvectors of triangular Schur form, old + @defgroup trevc3 trevc3: eigenvectors of triangular Schur form, blocked + @defgroup laln2 laln2: 1x1 or 2x2 solve, step in trevc + + @defgroup trsyl trsyl: Sylvester equation + @defgroup trsyl3 trsyl3: Sylvester equation, level 3 + @defgroup lasy2 lasy2: Sylvester equation + + @defgroup trsna trsna: eig condition numbers + @defgroup laqtr laqtr: quasi-triangular solve + + @defgroup trexc trexc: reorder Schur form + @defgroup trsen trsen: reorder Schur form + @defgroup laexc laexc: reorder Schur form + + @defgroup lanv2 lanv2: 2x2 Schur factor + + @defgroup laqr_group --- hseqr auxiliary --- + @defgroup laein laein: eigvec by Hessenberg inverse iteration + @defgroup lahqr lahqr: eig of Hessenberg, step in hseqr + @defgroup laqr0 laqr0: eig of Hessenberg, step in hseqr + @defgroup laqr1 laqr1: step in hseqr + @defgroup laqr2 laqr2: step in hseqr + @defgroup laqr3 laqr3: step in hseqr + @defgroup laqr4 laqr4: eig of Hessenberg, step in hseqr + @defgroup laqr5 laqr5: step in hseqr + + @defgroup iparmq iparmq: set parameters for hseqr + + @defgroup laqz_group --- ggev3, gges3 auxiliary --- + @defgroup laqz0 laqz0: step in ggev3, gges3 + @defgroup laqz1 laqz1: step in ggev3, gges3 + @defgroup laqz2 laqz2: step in ggev3, gges3 + @defgroup laqz3 laqz3: step in ggev3, gges3 + @defgroup laqz4 laqz4: step in ggev3, gges3 + @} + + @defgroup ggev_comp_grp Generalized eig computational routines + @{ + @defgroup ggbal ggbal: balance matrix + @defgroup gghrd gghrd: reduction to Hessenberg + @defgroup gghd3 gghd3: reduction to Hessenberg, level 3 + @defgroup hgeqz hgeqz: generalized Hessenberg eig + @defgroup ggbak ggbak: back-transform eigvec + @defgroup tgsen tgsen: reorder generalized Schur form + @defgroup tgsna tgsna: reciprocal cond est + @defgroup tgsyl tgsyl: Sylvester equation + @defgroup tgsy2 tgsy2: Sylvester equation panel (?) + @defgroup unm22 {un,or}m22: multiply by banded Q, step in gghd3 + @defgroup lagv2 lagv2: 2x2 generalized Schur factor + @defgroup tgevc tgevc: eigvec of pair of matrices + @defgroup tgexc tgexc: reorder generalized Schur form + @defgroup tgex2 tgex2: reorder generalized Schur form + @} + @} + + @defgroup heev_top Hermitian/symmetric eigenvalues + @{ + @defgroup heev_driver_grp Standard eig driver, AV = VΛ + @{ + @defgroup heev_driver --- full --- + @defgroup heev {he,sy}ev: eig, QR iteration + @defgroup heevd {he,sy}evd: eig, divide and conquer + @defgroup heevr {he,sy}evr: eig, MRRR + @defgroup heevx {he,sy}evx: eig, bisection + + @defgroup heev_driver2 --- full, 2-stage --- + @defgroup heev_2stage {he,sy}ev_2stage: eig, QR iteration + @defgroup heevd_2stage {he,sy}evd_2stage: eig, divide and conquer + @defgroup heevr_2stage {he,sy}evr_2stage: eig, MRRR + @defgroup heevx_2stage {he,sy}evx_2stage: eig, bisection + + @defgroup hpev_driver --- packed --- + @defgroup hpev {hp,sp}ev: eig, QR iteration + @defgroup hpevd {hp,sp}evd: eig, divide and conquer + @defgroup hpevx {hp,sp}evx: eig, bisection + + @defgroup hbev_driver --- banded --- + @defgroup hbev {hb,sb}ev: eig, QR iteration + @defgroup hbevd {hb,sb}evd: eig, divide and conquer + @defgroup hbevx {hb,sb}evx: eig, bisection + + @defgroup hbev_driver2 --- banded, 2nd-stage --- + @defgroup hbev_2stage {hb,sb}ev_2stage: eig, QR iteration + @defgroup hbevd_2stage {hb,sb}evd_2stage: eig, divide and conquer + @defgroup hbevx_2stage {hb,sb}evx_2stage: eig, bisection + + @defgroup stev_driver --- tridiagonal --- + @defgroup stev stev: eig, QR iteration + @defgroup stevd stevd: eig, divide and conquer + @defgroup stevr stevr: eig, MRRR + @defgroup stevx stevx: eig, bisection + @defgroup pteqr pteqr: eig, positive definite tridiagonal + + @defgroup stebz stebz: eig, Kahan + @defgroup sterf sterf: eig, QR iteration + @defgroup stedc stedc: eig, divide and conquer + @defgroup stegr stegr: eig, bisection, see stemr + @defgroup stein stein: eig, inverse iteration + @defgroup stemr stemr: eig, relatively robust representation (RRR) + @defgroup steqr steqr: eig, QR iteration + @} + + @defgroup hegv_driver_grp Generalized eig driver, AV = BVΛ, etc. + @{ + @defgroup hegv_driver --- full --- + @defgroup hegv {he,sy}gv: eig, QR iteration + @defgroup hegv_2stage {he,sy}gv_2stage: eig, QR iteration, 2-stage + @defgroup hegvd {he,sy}gvd: eig, divide and conquer + @defgroup hegvx {he,sy}gvx: eig, bisection + + @defgroup hpgv_driver --- packed --- + @defgroup hpgv {hp,sp}gv: eig, QR iteration + @defgroup hpgvd {hp,sp}gvd: eig, divide and conquer + @defgroup hpgvx {hp,sp}gvx: eig, bisection + + @defgroup hbgv_driver --- banded --- + @defgroup hbgv {hb,sb}gv: eig, QR iteration + @defgroup hbgvd {hb,sb}gvd: eig, divide and conquer + @defgroup hbgvx {hb,sb}gvx: eig, bisection + @} + + @defgroup heev_comp_grp Eig computational routines + @{ + @defgroup heev_comp --- full --- + @defgroup disna disna: eig condition numbers + @defgroup hetrd {he,sy}trd: reduction to tridiagonal + @defgroup hetd2 {he,sy}td2: reduction to tridiagonal, level 2 + @defgroup latrd latrd: step in hetrd + @defgroup ungtr {un,or}gtr: generate Q from hetrd + @defgroup unmtr {un,or}mtr: multiply by Q from hetrd + + @defgroup hetrd_2stage {he,sy}trd_2stage: reduction to tridiagonal, 2-stage + @defgroup hetrd_he2hb {he,sy}trd_he2hb: full to band (1st stage) + @defgroup hetrd_hb2st {he,sy}trd_hb2st: band to tridiagonal (2nd stage) + @defgroup hb2st_kernels {hb,sb}2st_kernels: band to tridiagonal (2nd stage) + + @defgroup lae2 lae2: 2x2 eig, step in steqr, stemr + @defgroup laesy laesy: 2x2 eig + @defgroup laev2 laev2: 2x2 eig + @defgroup lagtf lagtf: LU factor of (T - λI) + @defgroup lagts lagts: LU solve of (T - λI) x = y + + @defgroup hpev_comp --- packed --- + @defgroup hptrd {hp,sp}trd: reduction to tridiagonal + @defgroup upgtr {up,op}gtr: generate Q from hetrd + @defgroup upmtr {up,op}mtr: multiply by Q from hptrd + + @defgroup hbev_comp --- banded --- + @defgroup hbtrd {hb,sb}trd: reduction to tridiagonal + @} + + @defgroup hegv_comp_grp Generalized eig computational routines + @{ + @defgroup hegst {he,sy}gst: reduction to standard form + @defgroup hegs2 {he,sy}gs2: reduction to standard form, level 2 + @defgroup hpgst {hp,sp}gst: reduction to standard form, packed + @defgroup hbgst {hb,sb}gst: reduction to standard form, banded + @defgroup pbstf pbstf: split Cholesky factor, use with hbgst + @defgroup lag2 lag2: 2x2 eig + @} + + @defgroup stev_comp_grp tridiag bisection routines + @{ + @defgroup laebz laebz: counts eigvals <= value + @defgroup laneg laneg: Sturm count + @} + + @defgroup laed_comp_grp tridiag divide and conquer (D&C) routines + @{ + @defgroup laed0 laed0: D&C step: top level solver + @defgroup laed1 laed1: D&C step: merge subproblems + @defgroup laed2 laed2: D&C step: deflation + @defgroup laed3 laed3: D&C step: secular equation + @defgroup laed4 laed4: D&C step: secular equation nonlinear solver + @defgroup laed5 laed5: D&C step: secular equation, 2x2 + @defgroup laed6 laed6: D&C step: secular equation Newton step + @defgroup lamrg lamrg: permutation to merge 2 sorted lists + + @defgroup laed_comp2 --- eig value only or update Q --- + @defgroup laed7 laed7: D&C step: merge subproblems + @defgroup laed8 laed8: D&C step: deflation + @defgroup laed9 laed9: D&C step: secular equation + @defgroup laeda laeda: D&C step: z vector + @} + + @defgroup larr_comp_grp tridiag RRR routines + @{ + @defgroup larra larra: step in stemr + @defgroup larrb larrb: step in stemr + @defgroup larrc larrc: step in stemr + @defgroup larrd larrd: step in stemr, tridiag eig + @defgroup larre larre: step in stemr + @defgroup larrf larrf: step in stemr, find relative robust representation (RRR) + @defgroup larrj larrj: step in stemr, refine eigval estimates + @defgroup larrk larrk: step in stemr, compute one eigval + @defgroup larrr larrr: step in stemr, test to do expensive tridiag eig algorithm + @defgroup larrv larrv: eig tridiagonal, step in stemr & stegr + @defgroup lar1v lar1v: step in larrv, hence stemr & stegr + @} + @} + + @defgroup svd_top Singular Value Decomposition (SVD) + @{ + @defgroup svd_driver_grp Standard SVD driver, A = UΣV^H + @{ + @defgroup gesvd_driver --- full --- + @defgroup gesvd gesvd: SVD, QR iteration + @defgroup gesvdq gesvdq: SVD, QR with pivoting + @defgroup gesdd gesdd: SVD, divide and conquer + @defgroup gesvdx gesvdx: SVD, bisection + @defgroup gejsv gejsv: SVD, Jacobi, high-level + @defgroup gesvj gesvj: SVD, Jacobi, low-level + + @defgroup bdsvd_driver --- bidiagonal --- + @defgroup bdsqr bdsqr: bidiagonal SVD, QR iteration (dqds) + @defgroup bdsdc bdsdc: bidiagonal SVD, divide and conquer + @defgroup bdsvdx bdsvdx: bidiagonal SVD, bisection + @} + + @defgroup ggsvd_driver_grp Generalized SVD driver + @{ + @defgroup ggsvd3 ggsvd3: SVD, QR iteration + @} + + @defgroup gesvd_comp_grp SVD computational routines + @{ + @defgroup gebrd gebrd: reduction to bidiagonal + @defgroup gebd2 gebd2: reduction to bidiagonal, level 2 + @defgroup labrd labrd: step in gebrd + @defgroup gbbrd gbbrd: band to bidiagonal + @defgroup ungbr {un,or}gbr: generate Q, P from gebrd + @defgroup unmbr {un,or}mbr: multiply by Q, P from gebrd + + @defgroup gesvd_aux --- auxiliary routines --- + @defgroup gsvj0 gsvj0: step in gesvj + @defgroup gsvj1 gsvj1: step in gesvj + @defgroup las2 las2: 2x2 triangular SVD + @defgroup lasv2 lasv2: 2x2 triangular SVD + @defgroup lartgs lartgs: generate plane rotation for bidiag SVD + @} + + @defgroup ggsvd_comp_grp Generalized SVD computational routines + @{ + @defgroup ggsvp3 ggsvp3: step in ggsvd + @defgroup tgsja tgsja: generalized SVD of trapezoidal matrices, step in ggsvd3 + @defgroup lags2 lags2: 2x2 orthogonal factor, step in tgsja + @defgroup lapll lapll: linear dependence of 2 vectors + @} + + @defgroup lasq_comp_grp bidiag QR iteration routines + @{ + @defgroup lasq1 lasq1: dqds step + @defgroup lasq2 lasq2: dqds step + @defgroup lasq3 lasq3: dqds step + @defgroup lasq4 lasq4: dqds step + @defgroup lasq5 lasq5: dqds step + @defgroup lasq6 lasq6: dqds step + @} + + @defgroup lasd_comp_grp bidiag D&C routines + @{ + @defgroup lasd0 lasd0: D&C step: top level solver + @defgroup lasdt lasdt: D&C step: tree + @defgroup lasd1 lasd1: D&C step: merge subproblems + @defgroup lasd2 lasd2: D&C step: deflation + @defgroup lasd3 lasd3: D&C step: secular equation + @defgroup lasd4 lasd4: D&C step: secular equation nonlinear solver + @defgroup lasd5 lasd5: D&C step: secular equation, 2x2 + @defgroup lasdq lasdq: D&C step: leaf using bdsqr + + @defgroup lasd_comp2 --- singular values only or factored form --- + @defgroup lasda lasda: D&C step: top level solver + @defgroup lasd6 lasd6: D&C step: merge subproblems + @defgroup lasd7 lasd7: D&C step: deflation + @defgroup lasd8 lasd8: D&C step: secular equation + @} + @} + + @defgroup blas_like_top BLAS-like + @{ + @defgroup set_grp Initialize, copy, convert + @{ + @defgroup laset laset: set matrix + @defgroup larnv larnv: random vector + @defgroup laruv laruv: random uniform vector + + @defgroup lacpy lacpy: copy matrix + + @defgroup lacp2 lacp2: general matrix, convert real to complex + @defgroup _lag2_ _lag2_: general matrix, convert double <=> single + @defgroup _lat2_ _lat2_: triangular matrix, convert double <=> single + + @defgroup tfttp tfttp: triangular matrix, RFP (tf) to packed (tp) + @defgroup tfttr tfttr: triangular matrix, RFP (tf) to full (tr) + @defgroup tpttf tpttf: triangular matrix, packed (tp) to RFP (tf) + @defgroup tpttr tpttr: triangular matrix, packed (tp) to full (tr) + @defgroup trttf trttf: triangular matrix, full (tr) to RFP (tf) + @defgroup trttp trttp: triangular matrix, full (tr) to packed (tp) + @} + + @defgroup norm_grp Matrix norm + @{ + @defgroup lange lange: general matrix + @defgroup langb langb: general matrix, banded + @defgroup langt langt: general matrix, tridiagonal + + @defgroup lanhs lanhs: Hessenberg + + @defgroup lanhe lan{he,sy}: Hermitian/symmetric matrix + @defgroup lanhf lan{hf,sf}: Hermitian/symmetric matrix, RFP + @defgroup lanhp lan{hp,sp}: Hermitian/symmetric matrix, packed + @defgroup lanhb lan{hb,sb}: Hermitian/symmetric matrix, banded + @defgroup lanht lan{ht,st}: Hermitian/symmetric matrix, tridiagonal + + @defgroup lantr lantr: triangular matrix + @defgroup lantp lantp: triangular matrix, packed + @defgroup lantb lantb: triangular matrix, banded + @} + + @defgroup blas0_like_grp Scalar operations + @{ + @defgroup isnan isnan: test for NaN + @defgroup laisnan laisnan: test for NaN, unoptimized + @defgroup ladiv ladiv: complex divide + @defgroup lapy2 lapy2: robust sqrt( x^2 + y^2 ) + @defgroup lapy3 lapy3: robust sqrt( x^2 + y^2 + z^2 ) + @defgroup larmm larmm: scale factor to avoid overflow, step in latrs + @} + + @defgroup blas1_like_grp Level 1 BLAS-like vector ops + @{ + @defgroup lacgv lacgv: conjugate vector + @defgroup lasrt lasrt: sort vector + @defgroup lassq lassq: sum-of-squares, avoiding over/underflow + @defgroup rscl rscl: scale vector by reciprocal + @} + + @defgroup blas2_like_grp Level 2 BLAS-like matrix-vector ops + @{ + @defgroup ilalc ilalc: find non-zero col + @defgroup ilalr ilalr: find non-zero row + @defgroup lascl lascl: scale matrix + @defgroup la_geamv la_geamv: matrix-vector multiply |A| * |x|, general + @defgroup la_gbamv la_gbamv: matrix-vector multiply |A| * |x|, general banded + @defgroup la_heamv la_heamv: matrix-vector multiply |A| * |x|, Hermitian/symmetric + @defgroup lascl2 lascl2: diagonal scale matrix, A = D A + @defgroup larscl2 larscl2: reciprocal diagonal scale matrix, A = D^{-1} A + @defgroup la_wwaddw la_wwaddw: add to double-double or single-single vector + @} + + @defgroup blas3_like_grp Level 3 BLAS-like matrix-matrix ops + @{ + @defgroup lagtm lagtm: tridiagonal matrix-matrix multiply + @defgroup lacrm lacrm: complex * real matrix-matrix multiply + @defgroup larcm larcm: real * complex matrix-matrix multiply + @defgroup hfrk hfrk: Hermitian rank-k update, RFP format + @defgroup tfsm tfsm: triangular-matrix solve, RFP format + @} + @} + + @defgroup aux_top Auxiliary routines + @{ + @defgroup aux_grp Other auxiliary routines + @{ + @defgroup lsame lsame: string comparison + @defgroup lsamen lsamen: string comparison + @defgroup roundup_lwork roundup_lwork: fix rounding integer to float + @defgroup second second: wall clock timer + @} + + @defgroup params_grp Parameters + @{ + @defgroup lamch lamch: machine parameters + @defgroup lamc1 lamc1: ?? + @defgroup lamc2 lamc2: ?? + @defgroup lamc3 lamc3: ?? + @defgroup lamc4 lamc4: ?? + @defgroup lamc5 lamc5: ?? + @defgroup labad labad: over/underflow on obsolete pre-IEEE machines + @defgroup ilaver ilaver: LAPACK version + @defgroup ilaenv ilaenv: tuning parameters + @defgroup ilaenv2stage ilaenv2stage: tuning parameters for 2-stage eig + @defgroup iparam2stage iparam2stage: sets parameters for 2-stage eig + @defgroup ieeeck ieeeck: verify inf and NaN are safe + @defgroup la_constants la_constants: Fortran 95 module of constants + + @defgroup blast_aux --- BLAST constants --- + @defgroup iladiag iladiag: diag string to BLAST const + @defgroup ilaprec ilaprec: precision string to BLAST const + @defgroup ilatrans ilatrans: trans string to BLAST const + @defgroup ilauplo ilauplo: uplo string to BLAST const + @defgroup la_transtype la_transtype: BLAST const to string + @} + + @defgroup xerbla_grp Error reporting + @{ + @defgroup xerbla xerbla: error reporting + @defgroup xerbla_array xerbla_array: error reporting, callable from C + @} + @} +@} + +@defgroup blas_top BLAS +BLAS are defined by three papers: +Basic linear algebra subprograms for {FORTRAN} usage, Lawson et al, 1979. +An extended set of {FORTRAN} basic linear algebra subprograms, Dongarra et al, 1988. +A set of level 3 basic linear algebra subprograms, Dongarra et al, 1990. +Some BLAS-like routines (e.g., csymv, crot, csum1, icmax1) exist in +LAPACK rather than the classic BLAS. +These were extended by the Extra Precision BLAS (XBLAS, not documented here) +https://www.netlib.org/xblas/ + +@{ + @defgroup scalar_grp Scalar operations + @{ + @defgroup abs1 abs1: | real( x ) | + | imag( x ) | + @} + + @defgroup blas1_grp Level 1 BLAS: vector ops + @{ + @defgroup asum asum: sum | real( x_i ) | + | imag( x_i ) | + @defgroup sum1 sum1: sum | x_i | (in LAPACK) + @defgroup axpy axpy: y = ax + y + @defgroup copy copy: y = x + @defgroup dot dot: x^H x and x^T x + @defgroup iamax iamax: argmax_i | real( x_i ) | + | imag( x_i ) | + @defgroup imax1 imax1: argmax_i | x_i | (in LAPACK) + @defgroup nrm2 nrm2: || x ||_2 + @defgroup scal scal: x = alpha x + @defgroup swap swap: x <=> y + + @defgroup rot_comp --- Givens/Jacobi plane rotations --- + @defgroup rot rot: apply plane rotation ([cz]rot in LAPACK) + @defgroup rotg rotg: generate plane rotation (cf. lartg) + @defgroup rotm rotm: apply modified (fast) plane rotation + @defgroup rotmg rotmg: generate modified (fast) plane rotation + @} + + @defgroup blas2_grp Level 2 BLAS: matrix-vector ops + @{ + @defgroup blas2_full --- full --- + @defgroup gemv gemv: general matrix-vector multiply + @defgroup ger ger: general matrix rank-1 update + + @defgroup hemv {he,sy}mv: Hermitian/symmetric matrix-vector multiply ([cz]symv in LAPACK) + @defgroup her {he,sy}r: Hermitian/symmetric rank-1 update + @defgroup her2 {he,sy}r2: Hermitian/symmetric rank-2 update + + @defgroup trmv trmv: triangular matrix-vector multiply + @defgroup trsv trsv: triangular matrix-vector solve + + @defgroup blas2_packed --- packed --- + @defgroup hpmv {hp,sp}mv: Hermitian/symmetric matrix-vector multiply + @defgroup hpr {hp,sp}r: Hermitian/symmetric rank-1 update + @defgroup hpr2 {hp,sp}r2: Hermitian/symmetric rank-2 update + + @defgroup tpmv tpmv: triangular matrix-vector multiply + @defgroup tpsv tpsv: triangular matrix-vector solve + + @defgroup blas2_banded --- banded --- + @defgroup gbmv gbmv: general matrix-vector multiply + + @defgroup hbmv {hb,sb}mv: Hermitian/symmetric matrix-vector multiply + + @defgroup tbmv tbmv: triangular matrix-vector multiply + @defgroup tbsv tbsv: triangular matrix-vector solve + @} + + @defgroup blas3_grp Level 3 BLAS: matrix-matrix ops + @{ + @defgroup gemm gemm: general matrix-matrix multiply + + @defgroup hemm {he,sy}mm: Hermitian/symmetric matrix-matrix multiply + @defgroup herk {he,sy}rk: Hermitian/symmetric rank-k update + @defgroup her2k {he,sy}r2k: Hermitian/symmetric rank-2k update + + @defgroup trmm trmm: triangular matrix-matrix multiply + @defgroup trsm trsm: triangular matrix-matrix solve + @} +@} + **/ diff --git a/lapack-netlib/DOCS/lawn81.tex b/lapack-netlib/DOCS/lawn81.tex index 794c2a7aa..668ca8d68 100644 --- a/lapack-netlib/DOCS/lawn81.tex +++ b/lapack-netlib/DOCS/lawn81.tex @@ -575,7 +575,7 @@ There are six machine-dependent functions in the test and timing package, at least three of which must be installed. They are \begin{tabbing} -MONOMO \= DOUBLE PRECYSION \= \kill +MONOMO \= DOUBLE PRECISION \= \kill LSAME \> LOGICAL \> Test if two characters are the same regardless of case \\ SLAMCH \> REAL \> Determine machine-dependent parameters \\ DLAMCH \> DOUBLE PRECISION \> Determine machine-dependent parameters \\ diff --git a/lapack-netlib/INSTALL/dlamch.c b/lapack-netlib/INSTALL/dlamch.c index 744130a87..ce6b76a32 100644 --- a/lapack-netlib/INSTALL/dlamch.c +++ b/lapack-netlib/INSTALL/dlamch.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,24 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} static double dpow_ui(double x, integer n) { double pow=1.0; unsigned long int u; if(n != 0) { @@ -291,217 +273,7 @@ static double dpow_ui(double x, integer n) { } return pow; } -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; - _Complex float zdotc = 0.0; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; - _Complex float zdotc = 0.0; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \author Univ. of Colorado Denver */ /* > \author NAG Ltd. */ -/* > \date November 2019 */ /* > \ingroup auxOTHERauxiliary */ @@ -573,7 +331,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* ===================================================================== */ *vers_major__ = 3; - *vers_minor__ = 11; + *vers_minor__ = 12; *vers_patch__ = 0; /* ===================================================================== */ diff --git a/lapack-netlib/INSTALL/ilaver.f b/lapack-netlib/INSTALL/ilaver.f index a246c37cb..1827d5cd2 100644 --- a/lapack-netlib/INSTALL/ilaver.f +++ b/lapack-netlib/INSTALL/ilaver.f @@ -44,7 +44,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 * *> \ingroup auxOTHERauxiliary * @@ -60,7 +59,7 @@ INTEGER VERS_MAJOR, VERS_MINOR, VERS_PATCH * ===================================================================== VERS_MAJOR = 3 - VERS_MINOR = 11 + VERS_MINOR = 12 VERS_PATCH = 0 * ===================================================================== * diff --git a/lapack-netlib/INSTALL/lsametst.c b/lapack-netlib/INSTALL/lsametst.c index 4b46115fc..631733841 100644 --- a/lapack-netlib/INSTALL/lsametst.c +++ b/lapack-netlib/INSTALL/lsametst.c @@ -426,7 +426,7 @@ static integer c__3 = 3; /* December 2016 */ /* ===================================================================== */ -/* Main program */ main(void) +/* Main program */ int main(void) { /* Format strings */ static char fmt_9999[] = "(\002 *** Error: LSAME( \002,a1,\002, \002," diff --git a/lapack-netlib/INSTALL/second_INT_ETIME.c b/lapack-netlib/INSTALL/second_INT_ETIME.c index 51b7ddb54..f2ad529e3 100644 --- a/lapack-netlib/INSTALL/second_INT_ETIME.c +++ b/lapack-netlib/INSTALL/second_INT_ETIME.c @@ -223,7 +223,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -237,149 +236,10 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; - _Complex float zdotc = 0.0; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; - _Complex float zdotc = 0.0; - if (incx == 1 && incy == 1) { - for (i=0;i + #define LAPACK_COMPLEX_CUSTOM + #define lapack_complex_float _Fcomplex + #define lapack_complex_double _Dcomplex + #define lapack_complex_float_real(z) (creal(z)) + #define lapack_complex_float_imag(z) (cimag(z)) + #define lapack_complex_double_real(z) (creal(z)) + #define lapack_complex_double_imag(z) (cimag(z)) +#else #if defined(LAPACK_COMPLEX_STRUCTURE) @@ -109,6 +120,7 @@ typedef struct { double real, imag; } _lapack_complex_double; #define lapack_complex_double_real(z) (creal(z)) #define lapack_complex_double_imag(z) (cimag(z)) +#endif #endif lapack_complex_float lapack_make_complex_float( float re, float im ); diff --git a/lapack-netlib/LAPACKE/src/CMakeLists.txt b/lapack-netlib/LAPACKE/src/CMakeLists.txt index 4171a3bd4..89890f923 100644 --- a/lapack-netlib/LAPACKE/src/CMakeLists.txt +++ b/lapack-netlib/LAPACKE/src/CMakeLists.txt @@ -589,6 +589,8 @@ lapacke_cungtr.c lapacke_cungtr_work.c lapacke_cungtsqr_row.c lapacke_cungtsqr_row_work.c +lapacke_cunhr_col.c +lapacke_cunhr_col_work.c lapacke_cunmbr.c lapacke_cunmbr_work.c lapacke_cunmhr.c @@ -857,6 +859,8 @@ lapacke_dorgtr.c lapacke_dorgtr_work.c lapacke_dorgtsqr_row.c lapacke_dorgtsqr_row_work.c +lapacke_dorhr_col.c +lapacke_dorhr_col_work.c lapacke_dormbr.c lapacke_dormbr_work.c lapacke_dormhr.c @@ -1432,6 +1436,8 @@ lapacke_sorgtr.c lapacke_sorgtr_work.c lapacke_sorgtsqr_row.c lapacke_sorgtsqr_row_work.c +lapacke_sorhr_col.c +lapacke_sorhr_col_work.c lapacke_sormbr.c lapacke_sormbr_work.c lapacke_sormhr.c @@ -2346,6 +2352,8 @@ lapacke_zungtr.c lapacke_zungtr_work.c lapacke_zungtsqr_row.c lapacke_zungtsqr_row_work.c +lapacke_zunhr_col.c +lapacke_zunhr_col_work.c lapacke_zunmbr.c lapacke_zunmbr_work.c lapacke_zunmhr.c diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index 9c02c1445..969288f42 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -28,7 +28,6 @@ ############################################################################## # Contents: Native C interface to LAPACK # Author: Intel Corporation -# September, 2010 ############################################################################## # makefile for LAPACKE, used to build lapacke binary. # @@ -138,6 +137,10 @@ lapacke_cgerqf.o \ lapacke_cgerqf_work.o \ lapacke_cgesdd.o \ lapacke_cgesdd_work.o \ +lapacke_cgedmd.o \ +lapacke_cgedmd_work.o \ +lapacke_cgedmdq.o \ +lapacke_cgedmdq_work.o \ lapacke_cgesv.o \ lapacke_cgesv_work.o \ lapacke_cgesvd.o \ @@ -608,6 +611,8 @@ lapacke_ctrsna.o \ lapacke_ctrsna_work.o \ lapacke_ctrsyl.o \ lapacke_ctrsyl_work.o \ +lapacke_ctrsyl3.o \ +lapacke_ctrsyl3_work.o \ lapacke_ctrtri.o \ lapacke_ctrtri_work.o \ lapacke_ctrtrs.o \ @@ -640,6 +645,8 @@ lapacke_cungtr.o \ lapacke_cungtr_work.o \ lapacke_cungtsqr_row.o \ lapacke_cungtsqr_row_work.o \ +lapacke_cunhr_col.o \ +lapacke_cunhr_col_work.o \ lapacke_cunmbr.o \ lapacke_cunmbr_work.o \ lapacke_cunmhr.o \ @@ -760,6 +767,10 @@ lapacke_dgerqf.o \ lapacke_dgerqf_work.o \ lapacke_dgesdd.o \ lapacke_dgesdd_work.o \ +lapacke_dgedmd.o \ +lapacke_dgedmd_work.o \ +lapacke_dgedmdq.o \ +lapacke_dgedmdq_work.o \ lapacke_dgesv.o \ lapacke_dgesv_work.o \ lapacke_dgesvd.o \ @@ -912,6 +923,8 @@ lapacke_dorgtr.o \ lapacke_dorgtr_work.o \ lapacke_dorgtsqr_row.o \ lapacke_dorgtsqr_row_work.o \ +lapacke_dorhr_col.o \ +lapacke_dorhr_col_work.o \ lapacke_dormbr.o \ lapacke_dormbr_work.o \ lapacke_dormhr.o \ @@ -1224,6 +1237,8 @@ lapacke_dtrsna.o \ lapacke_dtrsna_work.o \ lapacke_dtrsyl.o \ lapacke_dtrsyl_work.o \ +lapacke_dtrsyl3.o \ +lapacke_dtrsyl3_work.o \ lapacke_dtrtri.o \ lapacke_dtrtri_work.o \ lapacke_dtrtrs.o \ @@ -1336,6 +1351,10 @@ lapacke_sgerqf.o \ lapacke_sgerqf_work.o \ lapacke_sgesdd.o \ lapacke_sgesdd_work.o \ +lapacke_sgedmd.o \ +lapacke_sgedmd_work.o \ +lapacke_sgedmdq.o \ +lapacke_sgedmdq_work.o \ lapacke_sgesv.o \ lapacke_sgesv_work.o \ lapacke_sgesvd.o \ @@ -1486,6 +1505,8 @@ lapacke_sorgtr.o \ lapacke_sorgtr_work.o \ lapacke_sorgtsqr_row.o \ lapacke_sorgtsqr_row_work.o \ +lapacke_sorhr_col.o \ +lapacke_sorhr_col_work.o \ lapacke_sormbr.o \ lapacke_sormbr_work.o \ lapacke_sormhr.o \ @@ -1794,6 +1815,8 @@ lapacke_strsna.o \ lapacke_strsna_work.o \ lapacke_strsyl.o \ lapacke_strsyl_work.o \ +lapacke_strsyl3.o \ +lapacke_strsyl3_work.o \ lapacke_strtri.o \ lapacke_strtri_work.o \ lapacke_strtrs.o \ @@ -1902,6 +1925,10 @@ lapacke_zgerqf.o \ lapacke_zgerqf_work.o \ lapacke_zgesdd.o \ lapacke_zgesdd_work.o \ +lapacke_zgedmd.o \ +lapacke_zgedmd_work.o \ +lapacke_zgedmdq.o \ +lapacke_zgedmdq_work.o \ lapacke_zgesv.o \ lapacke_zgesv_work.o \ lapacke_zgesvd.o \ @@ -2372,6 +2399,8 @@ lapacke_ztrsna.o \ lapacke_ztrsna_work.o \ lapacke_ztrsyl.o \ lapacke_ztrsyl_work.o \ +lapacke_ztrsyl3.o \ +lapacke_ztrsyl3_work.o \ lapacke_ztrtri.o \ lapacke_ztrtri_work.o \ lapacke_ztrtrs.o \ @@ -2404,6 +2433,8 @@ lapacke_zungtr.o \ lapacke_zungtr_work.o \ lapacke_zungtsqr_row.o \ lapacke_zungtsqr_row_work.o \ +lapacke_zunhr_col.o \ +lapacke_zunhr_col_work.o \ lapacke_zunmbr.o \ lapacke_zunmbr_work.o \ lapacke_zunmhr.o \ @@ -2544,7 +2575,7 @@ $(LAPACKELIB): $(OBJ) $(OBJ_S) $(OBJ_C) $(OBJ_D) $(OBJ_Z) $(DEPRECATED) $(EXTEND ifdef BUILD_DEPRECATED $(AR) $(ARFLAGS) $@ $(DEPRECATED) endif -ifdef (USEXBLAS) +ifdef USEXBLAS $(AR) $(ARFLAGS) $@ $(EXTENDED) endif ifdef LAPACKE_WITH_TMG diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c new file mode 100644 index 000000000..6c77e199e --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c @@ -0,0 +1,127 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function cgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cgedmd( int matrix_layout, char jobs, char jobz, char jobr, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, lapack_complex_float* x, + lapack_int ldx, lapack_complex_float* y, + lapack_int ldy, lapack_int nrnk, float* tol, + lapack_int k, lapack_complex_float* eigs, + lapack_complex_float* z, lapack_int ldz, + float* res, lapack_complex_float* b, + lapack_int ldb, lapack_complex_float* w, + lapack_int ldw, lapack_complex_float* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + lapack_int lzwork = -1; + lapack_complex_float* zwork = NULL; + float* work = NULL; + lapack_int* iwork = NULL; + lapack_complex_float zwork_query; + float work_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_cgedmd", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -8; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -10; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -15; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -18; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, w, ldw ) ) { + return -20; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -22; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz, + res, b, ldb, w, ldw, s, lds, &zwork_query, + lzwork, &work_query, lwork, &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lzwork = LAPACK_C2INT( zwork_query ); + lwork = LAPACK_C2INT( work_query ); + liwork = iwork_query; + /* Allocate memory for work arrays */ + zwork = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lzwork ); + if( zwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_2; + } + /* Call middle-level interface */ + info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz, + res, b, ldb, w, ldw, s, lds, zwork, lzwork, + work, lwork, iwork, liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_2: + LAPACKE_free( work ); +exit_level_1: + LAPACKE_free( zwork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cgedmd", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c new file mode 100644 index 000000000..08d8b91f5 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c @@ -0,0 +1,184 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function cgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cgedmd_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, lapack_complex_float* x, lapack_int ldx, + lapack_complex_float* y, lapack_int ldy, lapack_int nrnk, + float* tol, lapack_int k, lapack_complex_float* eigs, + lapack_complex_float* z, lapack_int ldz, + float* res, lapack_complex_float* b, + lapack_int ldb, lapack_complex_float* w, + lapack_int ldw, lapack_complex_float* s, lapack_int lds, + lapack_complex_float* zwork, lapack_int lzwork, + float* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_cgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, y, + &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b, &ldb, w, &ldw, + s, &lds, zwork, &lzwork, work, &lwork, iwork, &liwork, + &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldw_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + lapack_complex_float* x_t = NULL; + lapack_complex_float* y_t = NULL; + lapack_complex_float* z_t = NULL; + lapack_complex_float* b_t = NULL; + lapack_complex_float* w_t = NULL; + lapack_complex_float* s_t = NULL; + /* Check leading dimension(s) */ + if( ldx < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + return info; + } + if( ldy < n ) { + info = -11; + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + return info; + } + if( ldz < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + return info; + } + if( ldb < n ) { + info = -19; + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + return info; + } + if( ldw < n ) { + info = -21; + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + return info; + } + if( lds < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_cgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, + &ldx, y, &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b, + &ldb, w, &ldw, s, &lds, zwork, &lzwork, + work, &lwork, iwork, &liwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + x_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + y_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + z_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + b_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + w_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldw_t * MAX(1,n) ); + if( w_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + s_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + /* Transpose input matrices */ + LAPACKE_cge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_cge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_cge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_cge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_cge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t ); + LAPACKE_cge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function and adjust info */ + LAPACK_cgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t, + &ldx_t, y_t, &ldy_t, &nrnk, tol, &k, eigs, z_t, &ldz_t, + res, b_t, &ldb_t, w_t, &ldw_t, s_t, &lds_t, zwork, + &lzwork, work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, w_t, ldw_t, w, ldw ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_5: + LAPACKE_free( w_t ); +exit_level_4: + LAPACKE_free( b_t ); +exit_level_3: + LAPACKE_free( z_t ); +exit_level_2: + LAPACKE_free( y_t ); +exit_level_1: + LAPACKE_free( x_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c new file mode 100644 index 000000000..b0b258f97 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c @@ -0,0 +1,133 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function cgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cgedmdq( int matrix_layout, char jobs, char jobz, char jobr, + char jobq, char jobt, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, lapack_complex_float* f, + lapack_int ldf, lapack_complex_float* x, + lapack_int ldx, lapack_complex_float* y, + lapack_int ldy, lapack_int nrnk, float* tol, + lapack_int k, lapack_complex_float* eigs, + lapack_complex_float* z, lapack_int ldz, + float* res, lapack_complex_float* b, + lapack_int ldb, lapack_complex_float* v, + lapack_int ldv, lapack_complex_float* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + lapack_int lzwork = -1; + lapack_complex_float* zwork = NULL; + float* work = NULL; + lapack_int* iwork = NULL; + lapack_complex_float zwork_query; + float work_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_cgedmdq", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, n, f, ldf ) ) { + return -11; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -13; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -15; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -22; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -25; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, v, ldv ) ) { + return -27; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -29; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_cgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, eigs, z, ldz, res, + b, ldb, v, ldv, s, lds, &zwork_query, lzwork, + &work_query, lwork, &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lzwork = LAPACK_C2INT( zwork_query ); + lwork = LAPACK_C2INT( work_query ); + liwork = iwork_query; + /* Allocate memory for work arrays */ + zwork = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lzwork ); + if( zwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + work = (float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_2; + } + /* Call middle-level interface */ + info = LAPACKE_cgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, eigs, z, ldz, res, + b, ldb, v, ldv, s, lds, zwork, lzwork, + work, lwork, iwork, liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_2: + LAPACKE_free( work ); +exit_level_1: + LAPACKE_free( zwork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cgedmdq", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c new file mode 100644 index 000000000..05287c1bc --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c @@ -0,0 +1,205 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function cgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobq, char jobt, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + lapack_complex_float* f, lapack_int ldf, + lapack_complex_float* x, lapack_int ldx, + lapack_complex_float* y, lapack_int ldy, + lapack_int nrnk, float* tol, lapack_int k, + lapack_complex_float* eigs, + lapack_complex_float* z, + lapack_int ldz, float* res, + lapack_complex_float* b, + lapack_int ldb, lapack_complex_float* v, + lapack_int ldv, lapack_complex_float* s, + lapack_int lds, lapack_complex_float *zwork, + lapack_int lzwork, float* work, + lapack_int lwork, lapack_int* iwork, + lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldf_t = MAX(1,m); + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldv_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + lapack_complex_float* f_t = NULL; + lapack_complex_float* x_t = NULL; + lapack_complex_float* y_t = NULL; + lapack_complex_float* z_t = NULL; + lapack_complex_float* b_t = NULL; + lapack_complex_float* v_t = NULL; + lapack_complex_float* s_t = NULL; + /* Check leading dimension(s) */ + if( ldf < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + return info; + } + if( ldx < n ) { + info = -14; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + return info; + } + if( ldy < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + return info; + } + if( ldz < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + return info; + } + if( ldb < n ) { + info = -26; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + return info; + } + if( ldv < n ) { + info = -28; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + return info; + } + if( lds < n ) { + info = -30; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lzwork == -1 || lwork == -1 || liwork == -1 ) { + LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + f_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldf_t * MAX(1,n) ); + if( f_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + x_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + y_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + z_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + b_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + v_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldv_t * MAX(1,n) ); + if( v_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + s_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_6; + } + /* Transpose input matrices */ + LAPACKE_cge_trans( matrix_layout, m, n, f, ldf, f_t, ldf_t ); + LAPACKE_cge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_cge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_cge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_cge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_cge_trans( matrix_layout, m, n, v, ldv, v_t, ldv_t ); + LAPACKE_cge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function and adjust info */ + LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, f_t, ldf_t, f, ldf ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, v_t, ldv_t, v, ldv ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_6: + LAPACKE_free( v_t ); +exit_level_5: + LAPACKE_free( b_t ); +exit_level_4: + LAPACKE_free( z_t ); +exit_level_3: + LAPACKE_free( y_t ); +exit_level_2: + LAPACKE_free( x_t ); +exit_level_1: + LAPACKE_free( f_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c index 962624d21..3f8f0cf17 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c @@ -83,7 +83,7 @@ lapack_int LAPACKE_cgeqrt_work( int matrix_layout, lapack_int m, lapack_int n, } /* Transpose output matrices */ LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); - LAPACKE_cge_trans( LAPACK_COL_MAJOR, ldt, MIN(m,n), t_t, ldt_t, t, + LAPACKE_cge_trans( LAPACK_COL_MAJOR, nb, MIN(m,n), t_t, ldt_t, t, ldt ); /* Release memory and exit */ LAPACKE_free( t_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c index ed12b476e..aac7b551d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) { LAPACKE_xerbla( "LAPACKE_clarfb", -8 ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c index 545769b83..67bbbd34f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c @@ -60,7 +60,7 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans, nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; ldc_t = MAX(1,m); ldt_t = MAX(1,k); diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunhr_col.c b/lapack-netlib/LAPACKE/src/lapacke_cunhr_col.c new file mode 100644 index 000000000..7ed1ad4c4 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cunhr_col.c @@ -0,0 +1,24 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_cunhr_col( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, lapack_complex_float* a, + lapack_int lda, lapack_complex_float* t, + lapack_int ldt, lapack_complex_float* d) +{ + lapack_int info = 0; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_cunhr_col", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -5; + } + } +#endif + /* Call middle-level interface */ + info = LAPACKE_cunhr_col_work( matrix_layout, m, n, nb, a, lda, t, ldt, d ); + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunhr_col_work.c b/lapack-netlib/LAPACKE/src/lapacke_cunhr_col_work.c new file mode 100644 index 000000000..76b8366f0 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cunhr_col_work.c @@ -0,0 +1,69 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_cunhr_col_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, lapack_complex_float* a, + lapack_int lda, lapack_complex_float* t, + lapack_int ldt, lapack_complex_float* d ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_cunhr_col( &m, &n, &nb, a, &lda, t, &ldt, d, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldt_t = MAX(1,MIN(nb,n)); + lapack_complex_float* a_t = NULL; + lapack_complex_float* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -6; + LAPACKE_xerbla( "LAPACKE_cunhr_col_work", info ); + return info; + } + if( ldt < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_cunhr_col_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * + ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_cunhr_col( &m, &n, &nb, a_t, &lda_t, t_t, &ldt_t, d, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, ldt, n, t_t, ldt_t, t, + ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cunhr_col_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_cunhr_col_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c new file mode 100644 index 000000000..6802378da --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c @@ -0,0 +1,114 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function dgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgedmd( int matrix_layout, char jobs, char jobz, char jobr, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, double* x, lapack_int ldx, double* y, + lapack_int ldy, lapack_int nrnk, double* tol, + lapack_int k, double* reig, double* imeig, + double* z, lapack_int ldz, + double* res, double* b, lapack_int ldb, + double* w, lapack_int ldw, double* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + double* work = NULL; + lapack_int* iwork = NULL; + double work_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dgedmd", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -8; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -10; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -15; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -18; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -20; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, w, ldw ) ) { + return -22; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz, + res, b, ldb, w, ldw, s, lds, &work_query, + lwork, &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int) work_query; + liwork = iwork_query; + /* Allocate memory for work arrays */ + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz, + res, b, ldb, w, ldw, s, lds, work, lwork, + iwork, liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgedmd", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c new file mode 100644 index 000000000..987709a1b --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c @@ -0,0 +1,181 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function dgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgedmd_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, double* x, + lapack_int ldx, double* y, lapack_int ldy, + lapack_int nrnk, double* tol, lapack_int k, + double* reig, double* imeig, + double* z, lapack_int ldz, double* res, + double* b, lapack_int ldb, double* w, + lapack_int ldw, double* s, lapack_int lds, + double* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_dgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, y, + &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, + s, &lds, work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldw_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + double* x_t = NULL; + double* y_t = NULL; + double* z_t = NULL; + double* b_t = NULL; + double* w_t = NULL; + double* s_t = NULL; + /* Check leading dimension(s) */ + if( ldx < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + return info; + } + if( ldy < n ) { + info = -11; + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + return info; + } + if( ldz < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + return info; + } + if( ldb < n ) { + info = -19; + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + return info; + } + if( ldw < n ) { + info = -21; + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + return info; + } + if( lds < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_dgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, + y, &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, &ldb, w, + &ldw, s, &lds, work, &lwork, iwork, &liwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + x_t = (double*)LAPACKE_malloc( sizeof(double) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + y_t = (double*)LAPACKE_malloc( sizeof(double) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + z_t = (double*)LAPACKE_malloc( sizeof(double) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + b_t = (double*)LAPACKE_malloc( sizeof(double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + w_t = (double*)LAPACKE_malloc( sizeof(double) * ldw_t * MAX(1,n) ); + if( w_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + s_t = (double*)LAPACKE_malloc( sizeof(double) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + /* Transpose input matrices */ + LAPACKE_dge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_dge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_dge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_dge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_dge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t ); + LAPACKE_dge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function and adjust info */ + LAPACK_dgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, + y_t, &ldy_t, &nrnk, tol, &k, reig, imeig, z_t, &ldz_t, res, b_t, + &ldb_t, w_t, &ldw_t, s_t, &lds_t, work, &lwork, + iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, w_t, ldw_t, w, ldw ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_5: + LAPACKE_free( w_t ); +exit_level_4: + LAPACKE_free( b_t ); +exit_level_3: + LAPACKE_free( z_t ); +exit_level_2: + LAPACKE_free( y_t ); +exit_level_1: + LAPACKE_free( x_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c new file mode 100644 index 000000000..5c3c39308 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c @@ -0,0 +1,119 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function dgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgedmdq( int matrix_layout, char jobs, char jobz, char jobr, + char jobq, char jobt, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, double* f, lapack_int ldf, + double* x, lapack_int ldx, double* y, lapack_int ldy, + lapack_int nrnk, double* tol, lapack_int k, + double* reig, double* imeig, double* z, + lapack_int ldz, double* res, double* b, lapack_int ldb, + double* v, lapack_int ldv, double* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + double* work = NULL; + lapack_int* iwork = NULL; + double work_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dgedmdq", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, n, f, ldf ) ) { + return -11; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -13; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -15; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -22; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -25; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, v, ldv ) ) { + return -27; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -29; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_dgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, reig, imeig, z, ldz, res, + b, ldb, v, ldv, s, lds, &work_query, lwork, + &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int) work_query; + liwork = iwork_query; + /* Allocate memory for work arrays */ + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_dgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, reig, imeig, z, ldz, res, + b, ldb, v, ldv, s, lds, work, lwork, iwork, + liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgedmdq", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c new file mode 100644 index 000000000..149e6d24f --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c @@ -0,0 +1,200 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function dgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobq, char jobt, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + double* f, lapack_int ldf, double* x, + lapack_int ldx, double* y, lapack_int ldy, + lapack_int nrnk, double* tol, lapack_int k, + double* reig, double* imeig, double* z, + lapack_int ldz, double* res, double* b, + lapack_int ldb, double* v, lapack_int ldv, + double* s, lapack_int lds, double* work, + lapack_int lwork, lapack_int* iwork, + lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldf_t = MAX(1,m); + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldv_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + double* f_t = NULL; + double* x_t = NULL; + double* y_t = NULL; + double* z_t = NULL; + double* b_t = NULL; + double* v_t = NULL; + double* s_t = NULL; + /* Check leading dimension(s) */ + if( ldf < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + return info; + } + if( ldx < n ) { + info = -14; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + return info; + } + if( ldy < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + return info; + } + if( ldz < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + return info; + } + if( ldb < n ) { + info = -26; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + return info; + } + if( ldv < n ) { + info = -28; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + return info; + } + if( lds < n ) { + info = -30; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 || liwork == -1 ) { + LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + f_t = (double*)LAPACKE_malloc( sizeof(double) * ldf_t * MAX(1,n) ); + if( f_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + x_t = (double*)LAPACKE_malloc( sizeof(double) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + y_t = (double*)LAPACKE_malloc( sizeof(double) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + z_t = (double*)LAPACKE_malloc( sizeof(double) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + b_t = (double*)LAPACKE_malloc( sizeof(double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + v_t = (double*)LAPACKE_malloc( sizeof(double) * ldv_t * MAX(1,n) ); + if( v_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + s_t = (double*)LAPACKE_malloc( sizeof(double) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_6; + } + /* Transpose input matrices */ + LAPACKE_dge_trans( matrix_layout, m, n, f, ldf, f_t, ldf_t ); + LAPACKE_dge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_dge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_dge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_dge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_dge_trans( matrix_layout, m, n, v, ldv, v_t, ldv_t ); + LAPACKE_dge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function and adjust info */ + LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, f_t, ldf_t, f, ldf ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, v_t, ldv_t, v, ldv ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_6: + LAPACKE_free( v_t ); +exit_level_5: + LAPACKE_free( b_t ); +exit_level_4: + LAPACKE_free( z_t ); +exit_level_3: + LAPACKE_free( y_t ); +exit_level_2: + LAPACKE_free( x_t ); +exit_level_1: + LAPACKE_free( f_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c b/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c index 7796edffc..0460b6406 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c @@ -80,10 +80,13 @@ lapack_int LAPACKE_dgeesx( int matrix_layout, char jobvs, char sort, /* Allocate memory for work arrays */ if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) { iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); - if( iwork == NULL ) { - info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; - } + } + else { + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * 1 ); + } + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; } work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); if( work == NULL ) { @@ -97,9 +100,7 @@ lapack_int LAPACKE_dgeesx( int matrix_layout, char jobvs, char sort, /* Release memory and exit */ LAPACKE_free( work ); exit_level_2: - if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) { - LAPACKE_free( iwork ); - } + LAPACKE_free( iwork ); exit_level_1: if( LAPACKE_lsame( sort, 's' ) ) { LAPACKE_free( bwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c index 8a4c7cead..000c94e0b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c @@ -80,7 +80,7 @@ lapack_int LAPACKE_dgeqrt_work( int matrix_layout, lapack_int m, lapack_int n, } /* Transpose output matrices */ LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); - LAPACKE_dge_trans( LAPACK_COL_MAJOR, ldt, MIN(m,n), t_t, ldt_t, t, + LAPACKE_dge_trans( LAPACK_COL_MAJOR, nb, MIN(m,n), t_t, ldt_t, t, ldt ); /* Release memory and exit */ LAPACKE_free( t_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c index f4ddc62a5..aeebd8dec 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c @@ -57,7 +57,7 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) { LAPACKE_xerbla( "LAPACKE_dlarfb", -8 ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c index de444c146..de2f41e66 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c @@ -59,7 +59,7 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans, nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; ldc_t = MAX(1,m); ldt_t = MAX(1,k); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorhr_col.c b/lapack-netlib/LAPACKE/src/lapacke_dorhr_col.c new file mode 100644 index 000000000..1f37725e9 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dorhr_col.c @@ -0,0 +1,24 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_dorhr_col( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, double* a, + lapack_int lda, double* t, + lapack_int ldt, double* d) +{ + lapack_int info = 0; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dorhr_col", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -5; + } + } +#endif + /* Call middle-level interface */ + info = LAPACKE_dorhr_col_work( matrix_layout, m, n, nb, a, lda, t, ldt, d ); + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorhr_col_work.c b/lapack-netlib/LAPACKE/src/lapacke_dorhr_col_work.c new file mode 100644 index 000000000..28b80cc02 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dorhr_col_work.c @@ -0,0 +1,69 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_dorhr_col_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, double* a, + lapack_int lda, double* t, + lapack_int ldt, double* d ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_dorhr_col( &m, &n, &nb, a, &lda, t, &ldt, d, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldt_t = MAX(1,MIN(nb,n)); + double* a_t = NULL; + double* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -6; + LAPACKE_xerbla( "LAPACKE_dorhr_col_work", info ); + return info; + } + if( ldt < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_dorhr_col_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (double*) + LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (double*) + LAPACKE_malloc( sizeof(double) * + ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_dge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_dorhr_col( &m, &n, &nb, a_t, &lda_t, t_t, &ldt_t, d, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, ldt, n, t_t, ldt_t, t, + ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dorhr_col_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dorhr_col_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_nancheck.c b/lapack-netlib/LAPACKE/src/lapacke_nancheck.c index c7d5c33f1..bb894f351 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_nancheck.c +++ b/lapack-netlib/LAPACKE/src/lapacke_nancheck.c @@ -39,7 +39,7 @@ void LAPACKE_set_nancheck( int flag ) nancheck_flag = ( flag ) ? 1 : 0; } -int LAPACKE_get_nancheck( ) +int LAPACKE_get_nancheck( void ) { char* env; if ( nancheck_flag != -1 ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c new file mode 100644 index 000000000..6865fcf65 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c @@ -0,0 +1,114 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function sgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgedmd( int matrix_layout, char jobs, char jobz, char jobr, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, float* x, lapack_int ldx, float* y, + lapack_int ldy, lapack_int nrnk, float* tol, + lapack_int k, float* reig, float* imeig, + float* z, lapack_int ldz, float* res, + float* b, lapack_int ldb, float* w, lapack_int ldw, + float* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + float* work = NULL; + lapack_int* iwork = NULL; + float work_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_sgedmd", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -8; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -10; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -15; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -18; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -20; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, w, ldw ) ) { + return -22; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz, + res, b, ldb, w, ldw, s, lds, &work_query, + lwork, &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int) work_query; + liwork = iwork_query; + /* Allocate memory for work arrays */ + work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz, + res, b, ldb, w, ldw, s, lds, work, lwork, + iwork, liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgedmd", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c new file mode 100644 index 000000000..5b24152da --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c @@ -0,0 +1,182 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function sgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgedmd_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, float* x, + lapack_int ldx, float* y, lapack_int ldy, + lapack_int nrnk, float* tol, lapack_int k, + float* reig, float* imeig, + float* z, lapack_int ldz, float* res, + float* b, lapack_int ldb, float* w, + lapack_int ldw, float* s, lapack_int lds, + float* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_sgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, y, + &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, + s, &lds, work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldw_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + float* x_t = NULL; + float* y_t = NULL; + float* z_t = NULL; + float* b_t = NULL; + float* w_t = NULL; + float* s_t = NULL; + /* Check leading dimension(s) */ + if( ldx < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + return info; + } + if( ldy < n ) { + info = -11; + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + return info; + } + if( ldz < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + return info; + } + if( ldb < n ) { + info = -19; + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + return info; + } + if( ldw < n ) { + info = -21; + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + return info; + } + if( lds < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_sgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, + &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, + &ldb, w, &ldw, s, &lds, work, &lwork, iwork, + &liwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + x_t = (float*)LAPACKE_malloc( sizeof(float) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + y_t = (float*)LAPACKE_malloc( sizeof(float) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + z_t = (float*)LAPACKE_malloc( sizeof(float) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + b_t = (float*)LAPACKE_malloc( sizeof(float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + w_t = (float*)LAPACKE_malloc( sizeof(float) * ldw_t * MAX(1,n) ); + if( w_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + s_t = (float*)LAPACKE_malloc( sizeof(float) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_sge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_sge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_sge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_sge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t ); + LAPACKE_sge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function and adjust info */ + LAPACK_sgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t, + &ldx_t, y_t, &ldy_t, &nrnk, tol, &k, reig, imeig, z_t, &ldz_t, + res, b_t, &ldb_t, w_t, &ldw_t, s_t, &lds_t, work, + &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, w_t, ldw_t, w, ldw ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_5: + LAPACKE_free( w_t ); +exit_level_4: + LAPACKE_free( b_t ); +exit_level_3: + LAPACKE_free( z_t ); +exit_level_2: + LAPACKE_free( y_t ); +exit_level_1: + LAPACKE_free( x_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c new file mode 100644 index 000000000..e65c2094f --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c @@ -0,0 +1,119 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function sgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgedmdq( int matrix_layout, char jobs, char jobz, char jobr, + char jobq, char jobt, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, float* f, lapack_int ldf, + float* x, lapack_int ldx, float* y, lapack_int ldy, + lapack_int nrnk, float* tol, lapack_int k, + float* reig, float* imeig, float* z, + lapack_int ldz, float* res, float* b, lapack_int ldb, + float* v, lapack_int ldv, float* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + float* work = NULL; + lapack_int* iwork = NULL; + float work_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_sgedmdq", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, n, f, ldf ) ) { + return -11; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -13; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -15; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -22; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -25; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, v, ldv ) ) { + return -27; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -29; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_sgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, reig, imeig, z, ldz, res, + b, ldb, v, ldv, s, lds, &work_query, lwork, + &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int) work_query; + liwork = iwork_query; + /* Allocate memory for work arrays */ + work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_sgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, reig, imeig, z, ldz, res, + b, ldb, v, ldv, s, lds, work, lwork, iwork, + liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgedmdq", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c new file mode 100644 index 000000000..e1c1f5c98 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c @@ -0,0 +1,200 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function sgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobq, char jobt, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + float* f, lapack_int ldf, float* x, + lapack_int ldx, float* y, lapack_int ldy, + lapack_int nrnk, float* tol, lapack_int k, + float* reig, float* imeig, float* z, + lapack_int ldz, float* res, float* b, + lapack_int ldb, float* v, lapack_int ldv, + float* s, lapack_int lds, float* work, + lapack_int lwork, lapack_int* iwork, + lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldf_t = MAX(1,m); + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldv_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + float* f_t = NULL; + float* x_t = NULL; + float* y_t = NULL; + float* z_t = NULL; + float* b_t = NULL; + float* v_t = NULL; + float* s_t = NULL; + /* Check leading dimension(s) */ + if( ldf < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + return info; + } + if( ldx < n ) { + info = -14; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + return info; + } + if( ldy < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + return info; + } + if( ldz < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + return info; + } + if( ldb < n ) { + info = -26; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + return info; + } + if( ldv < n ) { + info = -28; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + return info; + } + if( lds < n ) { + info = -30; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 || liwork == -1 ) { + LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + f_t = (float*)LAPACKE_malloc( sizeof(float) * ldf_t * MAX(1,n) ); + if( f_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + x_t = (float*)LAPACKE_malloc( sizeof(float) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + y_t = (float*)LAPACKE_malloc( sizeof(float) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + z_t = (float*)LAPACKE_malloc( sizeof(float) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + b_t = (float*)LAPACKE_malloc( sizeof(float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + v_t = (float*)LAPACKE_malloc( sizeof(float) * ldv_t * MAX(1,n) ); + if( v_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + s_t = (float*)LAPACKE_malloc( sizeof(float) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_6; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, n, f, ldf, f_t, ldf_t ); + LAPACKE_sge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_sge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_sge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_sge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_sge_trans( matrix_layout, m, n, v, ldv, v_t, ldv_t ); + LAPACKE_sge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function and adjust info */ + LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, f_t, ldf_t, f, ldf ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, v_t, ldv_t, v, ldv ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_6: + LAPACKE_free( v_t ); +exit_level_5: + LAPACKE_free( b_t ); +exit_level_4: + LAPACKE_free( z_t ); +exit_level_3: + LAPACKE_free( y_t ); +exit_level_2: + LAPACKE_free( x_t ); +exit_level_1: + LAPACKE_free( f_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c b/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c index f7be44297..d2555ecc8 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c @@ -80,10 +80,13 @@ lapack_int LAPACKE_sgeesx( int matrix_layout, char jobvs, char sort, /* Allocate memory for work arrays */ if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) { iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); - if( iwork == NULL ) { - info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; - } + } + else { + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * 1 ); + } + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; } work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); if( work == NULL ) { @@ -97,9 +100,7 @@ lapack_int LAPACKE_sgeesx( int matrix_layout, char jobvs, char sort, /* Release memory and exit */ LAPACKE_free( work ); exit_level_2: - if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) { - LAPACKE_free( iwork ); - } + LAPACKE_free( iwork ); exit_level_1: if( LAPACKE_lsame( sort, 's' ) ) { LAPACKE_free( bwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c index 00ca6e501..d357845ae 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_sgeqrt_work( int matrix_layout, lapack_int m, lapack_int n, } /* Transpose output matrices */ LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); - LAPACKE_sge_trans( LAPACK_COL_MAJOR, ldt, MIN(m,n), t_t, ldt_t, t, + LAPACKE_sge_trans( LAPACK_COL_MAJOR, nb, MIN(m,n), t_t, ldt_t, t, ldt ); /* Release memory and exit */ LAPACKE_free( t_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c index d36958f93..3d6c29f88 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c @@ -57,7 +57,7 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) { LAPACKE_xerbla( "LAPACKE_slarfb", -8 ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c index 8b6127633..72a392a77 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c @@ -59,7 +59,7 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans, nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; ldc_t = MAX(1,m); ldt_t = MAX(1,k); diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorhr_col.c b/lapack-netlib/LAPACKE/src/lapacke_sorhr_col.c new file mode 100644 index 000000000..60e6e7951 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sorhr_col.c @@ -0,0 +1,24 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_sorhr_col( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, float* a, + lapack_int lda, float* t, + lapack_int ldt, float* d) +{ + lapack_int info = 0; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_sorhr_col", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -5; + } + } +#endif + /* Call middle-level interface */ + info = LAPACKE_sorhr_col_work( matrix_layout, m, n, nb, a, lda, t, ldt, d ); + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorhr_col_work.c b/lapack-netlib/LAPACKE/src/lapacke_sorhr_col_work.c new file mode 100644 index 000000000..56d6a965e --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sorhr_col_work.c @@ -0,0 +1,69 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_sorhr_col_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, float* a, + lapack_int lda, float* t, + lapack_int ldt, float* d ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_sorhr_col( &m, &n, &nb, a, &lda, t, &ldt, d, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldt_t = MAX(1,MIN(nb,n)); + float* a_t = NULL; + float* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -6; + LAPACKE_xerbla( "LAPACKE_sorhr_col_work", info ); + return info; + } + if( ldt < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_sorhr_col_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (float*) + LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (float*) + LAPACKE_malloc( sizeof(float) * + ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_sorhr_col( &m, &n, &nb, a_t, &lda_t, t_t, &ldt_t, d, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, ldt, n, t_t, ldt_t, t, + ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sorhr_col_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_sorhr_col_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c new file mode 100644 index 000000000..e4ea4fe10 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c @@ -0,0 +1,128 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function zgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgedmd( int matrix_layout, char jobs, char jobz, char jobr, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, lapack_complex_double* x, + lapack_int ldx, lapack_complex_double* y, + lapack_int ldy, lapack_int nrnk, double *tol, lapack_int k, + lapack_complex_double* eigs, lapack_complex_double* z, + lapack_int ldz, double* res, + lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* zw, lapack_int lzw, + lapack_complex_double* w, lapack_int ldw, + lapack_complex_double* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + lapack_int lzwork = -1; + lapack_complex_double* zwork = NULL; + double* work = NULL; + lapack_int* iwork = NULL; + double work_query; + lapack_int iwork_query; + lapack_complex_double zwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zgedmd", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -8; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -10; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -15; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -18; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -20; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, w, ldw ) ) { + return -22; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz, + res, b, ldb, w, ldw, s, lds, &zwork_query, lzwork, + &work_query, lwork, &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_Z2INT( work_query ); + liwork = iwork_query; + lzwork = LAPACK_Z2INT( zwork_query ); + /* Allocate memory for work arrays */ + zwork = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lzwork ); + if( zwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_2; + } + /* Call middle-level interface */ + info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz, + res, b, ldb, w, ldw, s, lds, zwork, lzwork, + work, lwork, iwork, liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_2: + LAPACKE_free( work ); +exit_level_1: + LAPACKE_free( zwork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgedmd", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c new file mode 100644 index 000000000..ebacfaa94 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c @@ -0,0 +1,186 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function zgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgedmd_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, + lapack_complex_double* x, + lapack_int ldx, lapack_complex_double* y, + lapack_int ldy, lapack_int nrnk, double *tol, lapack_int k, + lapack_complex_double* eigs, lapack_complex_double* z, + lapack_int ldz, double* res, + lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* w, lapack_int ldw, + lapack_complex_double* s, lapack_int lds, + lapack_complex_double* zwork, lapack_int lzwork, + double* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_zgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, + y, &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b, &ldb, w, + &ldw, s, &lds, zwork, &lzwork, work, &lwork, iwork, + &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldw_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + lapack_complex_double* x_t = NULL; + lapack_complex_double* y_t = NULL; + lapack_complex_double* z_t = NULL; + lapack_complex_double* b_t = NULL; + lapack_complex_double* w_t = NULL; + lapack_complex_double* s_t = NULL; + /* Check leading dimension(s) */ + if( ldx < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + return info; + } + if( ldy < n ) { + info = -11; + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + return info; + } + if( ldz < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + return info; + } + if( ldb < n ) { + info = -19; + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + return info; + } + if( ldw < n ) { + info = -21; + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + return info; + } + if( lds < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_zgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, + &ldx, y, &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b, + &ldb, w, &ldw, s, &lds, zwork, &lzwork, work, + &lwork, iwork, &liwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + x_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + y_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + z_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + b_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + w_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldw_t * MAX(1,n) ); + if( w_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + s_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_zge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_zge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_zge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_zge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t ); + LAPACKE_zge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function and adjust info */ + LAPACK_zgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t, + &ldx_t, y_t, &ldy_t, &nrnk, tol, &k, eigs, z_t, &ldz_t, + res, b_t, &ldb_t, w_t, &ldw_t, s_t, &lds_t, zwork, + &lzwork, work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, w_t, ldw_t, w, ldw ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_5: + LAPACKE_free( w_t ); +exit_level_4: + LAPACKE_free( b_t ); +exit_level_3: + LAPACKE_free( z_t ); +exit_level_2: + LAPACKE_free( y_t ); +exit_level_1: + LAPACKE_free( x_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c new file mode 100644 index 000000000..368d48e20 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c @@ -0,0 +1,133 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function zgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgedmdq( int matrix_layout, char jobs, char jobz, char jobr, + char jobq, char jobt, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, lapack_complex_double* f, + lapack_int ldf, lapack_complex_double* x, + lapack_int ldx, lapack_complex_double* y, + lapack_int ldy, lapack_int nrnk, double* tol, + lapack_int k, lapack_complex_double* eigs, + lapack_complex_double* z, lapack_int ldz, + double* res, lapack_complex_double* b, + lapack_int ldb, lapack_complex_double* v, + lapack_int ldv, lapack_complex_double* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + lapack_int lzwork = -1; + lapack_complex_double* zwork = NULL; + double* work = NULL; + lapack_int* iwork = NULL; + double work_query; + lapack_complex_double zwork_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zgedmdq", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, n, f, ldf ) ) { + return -11; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -13; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -15; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -22; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -25; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, v, ldv ) ) { + return -27; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -29; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_zgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, eigs, z, ldz, res, + b, ldb, v, ldv, s, lds, &zwork_query, lzwork, + &work_query, lwork, &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_Z2INT( work_query ); + lzwork = LAPACK_Z2INT( zwork_query ); + liwork = iwork_query; + /* Allocate memory for work arrays */ + zwork = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lzwork ); + if( zwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + work = (double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_2; + } + /* Call middle-level interface */ + info = LAPACKE_zgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, eigs, z, ldz, res, + b, ldb, v, ldv, s, lds, zwork, lzwork, + work, lwork, iwork, liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_2: + LAPACKE_free( work ); +exit_level_1: + LAPACKE_free( zwork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgedmdq", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c new file mode 100644 index 000000000..131e4f9ad --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c @@ -0,0 +1,205 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function zgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobq, char jobt, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + lapack_complex_double* f, lapack_int ldf, + lapack_complex_double* x, lapack_int ldx, + lapack_complex_double* y, lapack_int ldy, + lapack_int nrnk, double* tol, lapack_int k, + lapack_complex_double* eigs, + lapack_complex_double* z, + lapack_int ldz, double* res, + lapack_complex_double* b, + lapack_int ldb, lapack_complex_double* v, + lapack_int ldv, lapack_complex_double* s, + lapack_int lds, lapack_complex_double* zwork, + lapack_int lzwork, double* work, + lapack_int lwork, lapack_int* iwork, + lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldf_t = MAX(1,m); + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldv_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + lapack_complex_double* f_t = NULL; + lapack_complex_double* x_t = NULL; + lapack_complex_double* y_t = NULL; + lapack_complex_double* z_t = NULL; + lapack_complex_double* b_t = NULL; + lapack_complex_double* v_t = NULL; + lapack_complex_double* s_t = NULL; + /* Check leading dimension(s) */ + if( ldf < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + return info; + } + if( ldx < n ) { + info = -14; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + return info; + } + if( ldy < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + return info; + } + if( ldz < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + return info; + } + if( ldb < n ) { + info = -26; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + return info; + } + if( ldv < n ) { + info = -28; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + return info; + } + if( lds < n ) { + info = -30; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 || liwork == -1 ) { + LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + f_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldf_t * MAX(1,n) ); + if( f_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + x_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + y_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + z_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + b_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + v_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldv_t * MAX(1,n) ); + if( v_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + s_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_6; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, n, f, ldf, f_t, ldf_t ); + LAPACKE_zge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_zge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_zge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_zge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_zge_trans( matrix_layout, m, n, v, ldv, v_t, ldv_t ); + LAPACKE_zge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function and adjust info */ + LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, f_t, ldf_t, f, ldf ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, v_t, ldv_t, v, ldv ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_6: + LAPACKE_free( v_t ); +exit_level_5: + LAPACKE_free( b_t ); +exit_level_4: + LAPACKE_free( z_t ); +exit_level_3: + LAPACKE_free( y_t ); +exit_level_2: + LAPACKE_free( x_t ); +exit_level_1: + LAPACKE_free( f_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c index 8f6f1ed0b..633111533 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c @@ -83,7 +83,7 @@ lapack_int LAPACKE_zgeqrt_work( int matrix_layout, lapack_int m, lapack_int n, } /* Transpose output matrices */ LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); - LAPACKE_zge_trans( LAPACK_COL_MAJOR, ldt, MIN(m,n), t_t, ldt_t, t, + LAPACKE_zge_trans( LAPACK_COL_MAJOR, nb, MIN(m,n), t_t, ldt_t, t, ldt ); /* Release memory and exit */ LAPACKE_free( t_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c index 85355b202..c5edbbc0e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) { LAPACKE_xerbla( "LAPACKE_zlarfb", -8 ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c index 72d85ec82..232c8ef58 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c @@ -60,7 +60,7 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans, nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; ldc_t = MAX(1,m); ldt_t = MAX(1,k); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zunhr_col.c b/lapack-netlib/LAPACKE/src/lapacke_zunhr_col.c new file mode 100644 index 000000000..7e2507daf --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zunhr_col.c @@ -0,0 +1,24 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_zunhr_col( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, lapack_complex_double* a, + lapack_int lda, lapack_complex_double* t, + lapack_int ldt, lapack_complex_double* d) +{ + lapack_int info = 0; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zunhr_col", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -5; + } + } +#endif + /* Call middle-level interface */ + info = LAPACKE_zunhr_col_work( matrix_layout, m, n, nb, a, lda, t, ldt, d ); + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zunhr_col_work.c b/lapack-netlib/LAPACKE/src/lapacke_zunhr_col_work.c new file mode 100644 index 000000000..b5e640177 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zunhr_col_work.c @@ -0,0 +1,69 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_zunhr_col_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, lapack_complex_double* a, + lapack_int lda, lapack_complex_double* t, + lapack_int ldt, lapack_complex_double* d ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_zunhr_col( &m, &n, &nb, a, &lda, t, &ldt, d, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldt_t = MAX(1,MIN(nb,n)); + lapack_complex_double* a_t = NULL; + lapack_complex_double* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -6; + LAPACKE_xerbla( "LAPACKE_zunhr_col_work", info ); + return info; + } + if( ldt < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_zunhr_col_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * + ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_zunhr_col( &m, &n, &nb, a_t, &lda_t, t_t, &ldt_t, d, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, ldt, n, t_t, ldt_t, t, + ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zunhr_col_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_zunhr_col_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_cgb_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_cgb_trans.c index 2adf71493..b32fc2f9e 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_cgb_trans.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_cgb_trans.c @@ -54,7 +54,7 @@ void LAPACKE_cgb_trans( int matrix_layout, lapack_int m, lapack_int n, } } else if ( matrix_layout == LAPACK_ROW_MAJOR ) { /* TODO: interchange loops for performance. - * This is just reference impemeltation. + * This is just reference implementation. */ for( j = 0; j < MIN( n, ldin ); j++ ) { for( i = MAX( ku-j, 0 ); i < MIN3( ldout, m+ku-j, kl+ku+1 ); diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dgb_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_dgb_trans.c index 6624936a6..89e421eae 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_dgb_trans.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_dgb_trans.c @@ -54,7 +54,7 @@ void LAPACKE_dgb_trans( int matrix_layout, lapack_int m, lapack_int n, } } else if ( matrix_layout == LAPACK_ROW_MAJOR ) { /* TODO: interchange loops for performance. - * This is just reference impemeltation. + * This is just reference implementation. */ for( j = 0; j < MIN( n, ldin ); j++ ) { for( i = MAX( ku-j, 0 ); i < MIN3( ldout, m+ku-j, kl+ku+1 ); diff --git a/lapack-netlib/LAPACKE/utils/lapacke_sgb_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_sgb_trans.c index 046fdb9ca..a90c9617a 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_sgb_trans.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_sgb_trans.c @@ -54,7 +54,7 @@ void LAPACKE_sgb_trans( int matrix_layout, lapack_int m, lapack_int n, } } else if ( matrix_layout == LAPACK_ROW_MAJOR ) { /* TODO: interchange loops for performance. - * This is just reference impemeltation. + * This is just reference implementation. */ for( j = 0; j < MIN( n, ldin ); j++ ) { for( i = MAX( ku-j, 0 ); i < MIN3( ldout, m+ku-j, kl+ku+1 ); diff --git a/lapack-netlib/LAPACKE/utils/lapacke_zgb_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_zgb_trans.c index 32a9de379..63323b1c3 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_zgb_trans.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_zgb_trans.c @@ -54,7 +54,7 @@ void LAPACKE_zgb_trans( int matrix_layout, lapack_int m, lapack_int n, } } else if ( matrix_layout == LAPACK_ROW_MAJOR ) { /* TODO: interchange loops for performance. - * This is just reference impemeltation + * This is just reference implementation */ for( j = 0; j < MIN( n, ldin ); j++ ) { for( i = MAX( ku-j, 0 ); i < MIN3( ldout, m+ku-j, kl+ku+1 ); diff --git a/lapack-netlib/LICENSE b/lapack-netlib/LICENSE index 94cdb0f85..96b04c988 100644 --- a/lapack-netlib/LICENSE +++ b/lapack-netlib/LICENSE @@ -1,9 +1,9 @@ -Copyright (c) 1992-2017 The University of Tennessee and The University +Copyright (c) 1992-2023 The University of Tennessee and The University of Tennessee Research Foundation. All rights reserved. -Copyright (c) 2000-2017 The University of California Berkeley. All +Copyright (c) 2000-2023 The University of California Berkeley. All rights reserved. -Copyright (c) 2006-2017 The University of Colorado Denver. All rights +Copyright (c) 2006-2023 The University of Colorado Denver. All rights reserved. $COPYRIGHT$ diff --git a/lapack-netlib/README.md b/lapack-netlib/README.md index 142aa7b72..a00d4c51d 100644 --- a/lapack-netlib/README.md +++ b/lapack-netlib/README.md @@ -6,7 +6,7 @@ [![Appveyor](https://ci.appveyor.com/api/projects/status/bh38iin398msrbtr?svg=true)](https://ci.appveyor.com/project/langou/lapack/) [![codecov](https://codecov.io/gh/Reference-LAPACK/lapack/branch/master/graph/badge.svg)](https://codecov.io/gh/Reference-LAPACK/lapack) [![Packaging status](https://repology.org/badge/tiny-repos/lapack.svg)](https://repology.org/metapackage/lapack/versions) - +[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/Reference-LAPACK/lapack/badge)](https://securityscorecards.dev/viewer/?uri=github.com/Reference-LAPACK/lapack) * VERSION 1.0 : February 29, 1992 * VERSION 1.0a : June 30, 1992 @@ -37,6 +37,7 @@ * VERSION 3.10.0 : June 2021 * VERSION 3.10.1 : April 2022 * VERSION 3.11.0 : November 2022 +* VERSION 3.12.0 : November 2023 LAPACK is a library of Fortran subroutines for solving the most commonly occurring problems in numerical linear algebra. diff --git a/lapack-netlib/SRC/DEPRECATED/cgegs.c b/lapack-netlib/SRC/DEPRECATED/cgegs.c index 35b59e683..270a05ebd 100644 --- a/lapack-netlib/SRC/DEPRECATED/cgegs.c +++ b/lapack-netlib/SRC/DEPRECATED/cgegs.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,247 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief CGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief CGEGS computes the eigenvalues, Schur form, and, optionally, the left and or/right Schur vectors of a complex matrix pair (A,B) * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/cgegv.c b/lapack-netlib/SRC/DEPRECATED/cgegv.c index 5a5ea919f..d485b05b6 100644 --- a/lapack-netlib/SRC/DEPRECATED/cgegv.c +++ b/lapack-netlib/SRC/DEPRECATED/cgegv.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief CGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief CGEGV computes the eigenvalues and, optionally, the left and/or right eigenvectors of a complex matrix pair (A,B). * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/cgelqs.c b/lapack-netlib/SRC/DEPRECATED/cgelqs.c new file mode 100644 index 000000000..ee6d56119 --- /dev/null +++ b/lapack-netlib/SRC/DEPRECATED/cgelqs.c @@ -0,0 +1,479 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static complex c_b1 = {0.f,0.f}; +static complex c_b2 = {1.f,0.f}; + +/* > \brief \b CGELQS */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE CGELQS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* COMPLEX A( LDA, * ), B( LDB, * ), TAU( * ), */ +/* $ WORK( LWORK ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > Compute a minimum-norm solution */ +/* > f2cmin || A*X - B || */ +/* > using the LQ factorization */ +/* > A = L*Q */ +/* > computed by CGELQF. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of B. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX array, dimension (LDA,N) */ +/* > Details of the LQ factorization of the original matrix A as */ +/* > returned by CGELQF. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TAU */ +/* > \verbatim */ +/* > TAU is COMPLEX array, dimension (M) */ +/* > Details of the orthogonal matrix Q. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is COMPLEX array, dimension (LDB,NRHS) */ +/* > On entry, the m-by-nrhs right hand side matrix B. */ +/* > On exit, the n-by-nrhs solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= N. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is COMPLEX array, dimension (LWORK) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The length of the array WORK. LWORK must be at least NRHS, */ +/* > and should be at least NRHS*NB, where NB is the block size */ +/* > for this environment. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup complex_lin */ + +/* ===================================================================== */ +/* Subroutine */ int cgelqs_(integer *m, integer *n, integer *nrhs, complex * + a, integer *lda, complex *tau, complex *b, integer *ldb, complex * + work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1; + + /* Local variables */ + extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + integer *, integer *, complex *, complex *, integer *, complex *, + integer *), claset_(char *, + integer *, integer *, complex *, complex *, complex *, integer *), xerbla_(char *, integer *), cunmlq_(char *, char + *, integer *, integer *, integer *, complex *, integer *, complex + *, complex *, integer *, complex *, integer *, integer *); + + +/* -- LAPACK test routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --tau; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + if (*m < 0) { + *info = -1; + } else if (*n < 0 || *m > *n) { + *info = -2; + } else if (*nrhs < 0) { + *info = -3; + } else if (*lda < f2cmax(1,*m)) { + *info = -5; + } else if (*ldb < f2cmax(1,*n)) { + *info = -8; + } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { + *info = -10; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CGELQS", &i__1); + return 0; + } + +/* Quick return if possible */ + + if (*n == 0 || *nrhs == 0 || *m == 0) { + return 0; + } + +/* Solve L*X = B(1:m,:) */ + + ctrsm_("Left", "Lower", "No transpose", "Non-unit", m, nrhs, &c_b2, &a[ + a_offset], lda, &b[b_offset], ldb); + +/* Set B(m+1:n,:) to zero */ + + if (*m < *n) { + i__1 = *n - *m; + claset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[*m + 1 + b_dim1], ldb); + } + +/* B := Q' * B */ + + cunmlq_("Left", "Conjugate transpose", n, nrhs, m, &a[a_offset], lda, & + tau[1], &b[b_offset], ldb, &work[1], lwork, info); + + return 0; + +/* End of CGELQS */ + +} /* cgelqs_ */ + diff --git a/lapack-netlib/TESTING/LIN/cgelqs.f b/lapack-netlib/SRC/DEPRECATED/cgelqs.f similarity index 100% rename from lapack-netlib/TESTING/LIN/cgelqs.f rename to lapack-netlib/SRC/DEPRECATED/cgelqs.f diff --git a/lapack-netlib/SRC/DEPRECATED/cgelsx.c b/lapack-netlib/SRC/DEPRECATED/cgelsx.c index bdc395acf..cb3c33323 100644 --- a/lapack-netlib/SRC/DEPRECATED/cgelsx.c +++ b/lapack-netlib/SRC/DEPRECATED/cgelsx.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static complex c_b1 = {1.f,0.f}; + +/* > \brief \b CGEQRS */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE CGEQRS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* COMPLEX A( LDA, * ), B( LDB, * ), TAU( * ), */ +/* $ WORK( LWORK ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > Solve the least squares problem */ +/* > f2cmin || A*X - B || */ +/* > using the QR factorization */ +/* > A = Q*R */ +/* > computed by CGEQRF. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. M >= N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of B. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX array, dimension (LDA,N) */ +/* > Details of the QR factorization of the original matrix A as */ +/* > returned by CGEQRF. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TAU */ +/* > \verbatim */ +/* > TAU is COMPLEX array, dimension (N) */ +/* > Details of the orthogonal matrix Q. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is COMPLEX array, dimension (LDB,NRHS) */ +/* > On entry, the m-by-nrhs right hand side matrix B. */ +/* > On exit, the n-by-nrhs solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is COMPLEX array, dimension (LWORK) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The length of the array WORK. LWORK must be at least NRHS, */ +/* > and should be at least NRHS*NB, where NB is the block size */ +/* > for this environment. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup complex_lin */ + +/* ===================================================================== */ +/* Subroutine */ int cgeqrs_(integer *m, integer *n, integer *nrhs, complex * + a, integer *lda, complex *tau, complex *b, integer *ldb, complex * + work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1; + + /* Local variables */ + extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + integer *, integer *, complex *, complex *, integer *, complex *, + integer *), xerbla_(char *, + integer *), cunmqr_(char *, char *, integer *, integer *, + integer *, complex *, integer *, complex *, complex *, integer *, + complex *, integer *, integer *); + + +/* -- LAPACK test routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --tau; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + if (*m < 0) { + *info = -1; + } else if (*n < 0 || *n > *m) { + *info = -2; + } else if (*nrhs < 0) { + *info = -3; + } else if (*lda < f2cmax(1,*m)) { + *info = -5; + } else if (*ldb < f2cmax(1,*m)) { + *info = -8; + } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { + *info = -10; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CGEQRS", &i__1); + return 0; + } + +/* Quick return if possible */ + + if (*n == 0 || *nrhs == 0 || *m == 0) { + return 0; + } + +/* B := Q' * B */ + + cunmqr_("Left", "Conjugate transpose", m, nrhs, n, &a[a_offset], lda, & + tau[1], &b[b_offset], ldb, &work[1], lwork, info); + +/* Solve R*X = B(1:n,:) */ + + ctrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b1, &a[ + a_offset], lda, &b[b_offset], ldb); + + return 0; + +/* End of CGEQRS */ + +} /* cgeqrs_ */ + diff --git a/lapack-netlib/TESTING/LIN/cgeqrs.f b/lapack-netlib/SRC/DEPRECATED/cgeqrs.f similarity index 100% rename from lapack-netlib/TESTING/LIN/cgeqrs.f rename to lapack-netlib/SRC/DEPRECATED/cgeqrs.f diff --git a/lapack-netlib/SRC/DEPRECATED/cggsvd.c b/lapack-netlib/SRC/DEPRECATED/cggsvd.c index d6f7b5903..1a8a827c1 100644 --- a/lapack-netlib/SRC/DEPRECATED/cggsvd.c +++ b/lapack-netlib/SRC/DEPRECATED/cggsvd.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,248 +260,8 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i In particular, if B is an N-by-N nonsingular matrix, then the GSVD of *> A and B implicitly gives the SVD of A*inv(B): *> A*inv(B) = U*(D1*inv(D2))*V**H. -*> If ( A**H,B**H)**H has orthnormal columns, then the GSVD of A and B is also +*> If ( A**H,B**H)**H has orthonormal columns, then the GSVD of A and B is also *> equal to the CS decomposition of A and B. Furthermore, the GSVD can *> be used to derive the solution of the eigenvalue problem: *> A**H*A x = lambda* B**H*B x. diff --git a/lapack-netlib/SRC/DEPRECATED/cggsvp.c b/lapack-netlib/SRC/DEPRECATED/cggsvp.c index d29338f73..3bf587172 100644 --- a/lapack-netlib/SRC/DEPRECATED/cggsvp.c +++ b/lapack-netlib/SRC/DEPRECATED/cggsvp.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief DGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief DGEGS computes the eigenvalues, real Schur form, and, optionally, the left and/or right Schur vectors of a real matrix pair (A,B) * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/dgegv.c b/lapack-netlib/SRC/DEPRECATED/dgegv.c index c52b4b7d2..38df37116 100644 --- a/lapack-netlib/SRC/DEPRECATED/dgegv.c +++ b/lapack-netlib/SRC/DEPRECATED/dgegv.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief DGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief DGEGV computes the eigenvalues and, optionally, the left and/or right eigenvectors of a real matrix pair (A,B). * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/dgelqs.c b/lapack-netlib/SRC/DEPRECATED/dgelqs.c new file mode 100644 index 000000000..e3cf1e029 --- /dev/null +++ b/lapack-netlib/SRC/DEPRECATED/dgelqs.c @@ -0,0 +1,480 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static doublereal c_b7 = 1.; +static doublereal c_b9 = 0.; + +/* > \brief \b DGELQS */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE DGELQS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* DOUBLE PRECISION A( LDA, * ), B( LDB, * ), TAU( * ), */ +/* $ WORK( LWORK ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > Compute a minimum-norm solution */ +/* > f2cmin || A*X - B || */ +/* > using the LQ factorization */ +/* > A = L*Q */ +/* > computed by DGELQF. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of B. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is DOUBLE PRECISION array, dimension (LDA,N) */ +/* > Details of the LQ factorization of the original matrix A as */ +/* > returned by DGELQF. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TAU */ +/* > \verbatim */ +/* > TAU is DOUBLE PRECISION array, dimension (M) */ +/* > Details of the orthogonal matrix Q. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is DOUBLE PRECISION array, dimension (LDB,NRHS) */ +/* > On entry, the m-by-nrhs right hand side matrix B. */ +/* > On exit, the n-by-nrhs solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= N. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is DOUBLE PRECISION array, dimension (LWORK) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The length of the array WORK. LWORK must be at least NRHS, */ +/* > and should be at least NRHS*NB, where NB is the block size */ +/* > for this environment. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup double_lin */ + +/* ===================================================================== */ +/* Subroutine */ int dgelqs_(integer *m, integer *n, integer *nrhs, + doublereal *a, integer *lda, doublereal *tau, doublereal *b, integer * + ldb, doublereal *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1; + + /* Local variables */ + extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + integer *, integer *, doublereal *, doublereal *, integer *, + doublereal *, integer *), dlaset_( + char *, integer *, integer *, doublereal *, doublereal *, + doublereal *, integer *), xerbla_(char *, integer *), dormlq_(char *, char *, integer *, integer *, integer *, + doublereal *, integer *, doublereal *, doublereal *, integer *, + doublereal *, integer *, integer *); + + +/* -- LAPACK test routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --tau; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + if (*m < 0) { + *info = -1; + } else if (*n < 0 || *m > *n) { + *info = -2; + } else if (*nrhs < 0) { + *info = -3; + } else if (*lda < f2cmax(1,*m)) { + *info = -5; + } else if (*ldb < f2cmax(1,*n)) { + *info = -8; + } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { + *info = -10; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("DGELQS", &i__1); + return 0; + } + +/* Quick return if possible */ + + if (*n == 0 || *nrhs == 0 || *m == 0) { + return 0; + } + +/* Solve L*X = B(1:m,:) */ + + dtrsm_("Left", "Lower", "No transpose", "Non-unit", m, nrhs, &c_b7, &a[ + a_offset], lda, &b[b_offset], ldb); + +/* Set B(m+1:n,:) to zero */ + + if (*m < *n) { + i__1 = *n - *m; + dlaset_("Full", &i__1, nrhs, &c_b9, &c_b9, &b[*m + 1 + b_dim1], ldb); + } + +/* B := Q' * B */ + + dormlq_("Left", "Transpose", n, nrhs, m, &a[a_offset], lda, &tau[1], &b[ + b_offset], ldb, &work[1], lwork, info); + + return 0; + +/* End of DGELQS */ + +} /* dgelqs_ */ + diff --git a/lapack-netlib/TESTING/LIN/dgelqs.f b/lapack-netlib/SRC/DEPRECATED/dgelqs.f similarity index 100% rename from lapack-netlib/TESTING/LIN/dgelqs.f rename to lapack-netlib/SRC/DEPRECATED/dgelqs.f diff --git a/lapack-netlib/SRC/DEPRECATED/dgelsx.c b/lapack-netlib/SRC/DEPRECATED/dgelsx.c index 86977c94e..5abeebe84 100644 --- a/lapack-netlib/SRC/DEPRECATED/dgelsx.c +++ b/lapack-netlib/SRC/DEPRECATED/dgelsx.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static doublereal c_b9 = 1.; + +/* > \brief \b DGEQRS */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE DGEQRS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* DOUBLE PRECISION A( LDA, * ), B( LDB, * ), TAU( * ), */ +/* $ WORK( LWORK ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > Solve the least squares problem */ +/* > f2cmin || A*X - B || */ +/* > using the QR factorization */ +/* > A = Q*R */ +/* > computed by DGEQRF. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. M >= N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of B. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is DOUBLE PRECISION array, dimension (LDA,N) */ +/* > Details of the QR factorization of the original matrix A as */ +/* > returned by DGEQRF. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TAU */ +/* > \verbatim */ +/* > TAU is DOUBLE PRECISION array, dimension (N) */ +/* > Details of the orthogonal matrix Q. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is DOUBLE PRECISION array, dimension (LDB,NRHS) */ +/* > On entry, the m-by-nrhs right hand side matrix B. */ +/* > On exit, the n-by-nrhs solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is DOUBLE PRECISION array, dimension (LWORK) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The length of the array WORK. LWORK must be at least NRHS, */ +/* > and should be at least NRHS*NB, where NB is the block size */ +/* > for this environment. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup double_lin */ + +/* ===================================================================== */ +/* Subroutine */ int dgeqrs_(integer *m, integer *n, integer *nrhs, + doublereal *a, integer *lda, doublereal *tau, doublereal *b, integer * + ldb, doublereal *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1; + + /* Local variables */ + extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + integer *, integer *, doublereal *, doublereal *, integer *, + doublereal *, integer *), xerbla_( + char *, integer *), dormqr_(char *, char *, integer *, + integer *, integer *, doublereal *, integer *, doublereal *, + doublereal *, integer *, doublereal *, integer *, integer *); + + +/* -- LAPACK test routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --tau; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + if (*m < 0) { + *info = -1; + } else if (*n < 0 || *n > *m) { + *info = -2; + } else if (*nrhs < 0) { + *info = -3; + } else if (*lda < f2cmax(1,*m)) { + *info = -5; + } else if (*ldb < f2cmax(1,*m)) { + *info = -8; + } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { + *info = -10; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("DGEQRS", &i__1); + return 0; + } + +/* Quick return if possible */ + + if (*n == 0 || *nrhs == 0 || *m == 0) { + return 0; + } + +/* B := Q' * B */ + + dormqr_("Left", "Transpose", m, nrhs, n, &a[a_offset], lda, &tau[1], &b[ + b_offset], ldb, &work[1], lwork, info); + +/* Solve R*X = B(1:n,:) */ + + dtrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b9, &a[ + a_offset], lda, &b[b_offset], ldb); + + return 0; + +/* End of DGEQRS */ + +} /* dgeqrs_ */ + diff --git a/lapack-netlib/TESTING/LIN/dgeqrs.f b/lapack-netlib/SRC/DEPRECATED/dgeqrs.f similarity index 100% rename from lapack-netlib/TESTING/LIN/dgeqrs.f rename to lapack-netlib/SRC/DEPRECATED/dgeqrs.f diff --git a/lapack-netlib/SRC/DEPRECATED/dggsvd.c b/lapack-netlib/SRC/DEPRECATED/dggsvd.c index 71a4010d4..e5993b833 100644 --- a/lapack-netlib/SRC/DEPRECATED/dggsvd.c +++ b/lapack-netlib/SRC/DEPRECATED/dggsvd.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,247 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief SGEGS computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief SGEGS computes the eigenvalues, real Schur form, and, optionally, the left and/or right Schur vectors of a real matrix pair (A,B) * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/sgegv.c b/lapack-netlib/SRC/DEPRECATED/sgegv.c index eb129ab65..39b2d6fb6 100644 --- a/lapack-netlib/SRC/DEPRECATED/sgegv.c +++ b/lapack-netlib/SRC/DEPRECATED/sgegv.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief SGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief SGEGV computes the eigenvalues and, optionally, the left and/or right eigenvectors of a real matrix pair (A,B). * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/sgelqs.c b/lapack-netlib/SRC/DEPRECATED/sgelqs.c new file mode 100644 index 000000000..03034b0dc --- /dev/null +++ b/lapack-netlib/SRC/DEPRECATED/sgelqs.c @@ -0,0 +1,472 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +/* Table of constant values */ + +static real c_b7 = 1.f; +static real c_b9 = 0.f; + +/* > \brief \b SGELQS */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE SGELQS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* REAL A( LDA, * ), B( LDB, * ), TAU( * ), */ +/* $ WORK( LWORK ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > Compute a minimum-norm solution */ +/* > f2cmin || A*X - B || */ +/* > using the LQ factorization */ +/* > A = L*Q */ +/* > computed by SGELQF. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of B. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is REAL array, dimension (LDA,N) */ +/* > Details of the LQ factorization of the original matrix A as */ +/* > returned by SGELQF. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TAU */ +/* > \verbatim */ +/* > TAU is REAL array, dimension (M) */ +/* > Details of the orthogonal matrix Q. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is REAL array, dimension (LDB,NRHS) */ +/* > On entry, the m-by-nrhs right hand side matrix B. */ +/* > On exit, the n-by-nrhs solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= N. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is REAL array, dimension (LWORK) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The length of the array WORK. LWORK must be at least NRHS, */ +/* > and should be at least NRHS*NB, where NB is the block size */ +/* > for this environment. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup single_lin */ + +/* ===================================================================== */ +/* Subroutine */ int sgelqs_(integer *m, integer *n, integer *nrhs, real *a, + integer *lda, real *tau, real *b, integer *ldb, real *work, integer * + lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1; + + /* Local variables */ + extern /* Subroutine */ int strsm_(char *, char *, char *, char *, + integer *, integer *, real *, real *, integer *, real *, integer * + ), xerbla_(char *, integer *), slaset_(char *, integer *, integer *, real *, real *, + real *, integer *), sormlq_(char *, char *, integer *, + integer *, integer *, real *, integer *, real *, real *, integer * + , real *, integer *, integer *); + + +/* -- LAPACK test routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --tau; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + if (*m < 0) { + *info = -1; + } else if (*n < 0 || *m > *n) { + *info = -2; + } else if (*nrhs < 0) { + *info = -3; + } else if (*lda < f2cmax(1,*m)) { + *info = -5; + } else if (*ldb < f2cmax(1,*n)) { + *info = -8; + } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { + *info = -10; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("SGELQS", &i__1); + return 0; + } + +/* Quick return if possible */ + + if (*n == 0 || *nrhs == 0 || *m == 0) { + return 0; + } + +/* Solve L*X = B(1:m,:) */ + + strsm_("Left", "Lower", "No transpose", "Non-unit", m, nrhs, &c_b7, &a[ + a_offset], lda, &b[b_offset], ldb); + +/* Set B(m+1:n,:) to zero */ + + if (*m < *n) { + i__1 = *n - *m; + slaset_("Full", &i__1, nrhs, &c_b9, &c_b9, &b[*m + 1 + b_dim1], ldb); + } + +/* B := Q' * B */ + + sormlq_("Left", "Transpose", n, nrhs, m, &a[a_offset], lda, &tau[1], &b[ + b_offset], ldb, &work[1], lwork, info); + + return 0; + +/* End of SGELQS */ + +} /* sgelqs_ */ + diff --git a/lapack-netlib/TESTING/LIN/sgelqs.f b/lapack-netlib/SRC/DEPRECATED/sgelqs.f similarity index 100% rename from lapack-netlib/TESTING/LIN/sgelqs.f rename to lapack-netlib/SRC/DEPRECATED/sgelqs.f diff --git a/lapack-netlib/SRC/DEPRECATED/sgelsx.c b/lapack-netlib/SRC/DEPRECATED/sgelsx.c index a887910c9..b2c480481 100644 --- a/lapack-netlib/SRC/DEPRECATED/sgelsx.c +++ b/lapack-netlib/SRC/DEPRECATED/sgelsx.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static real c_b9 = 1.f; + +/* > \brief \b SGEQRS */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE SGEQRS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* REAL A( LDA, * ), B( LDB, * ), TAU( * ), */ +/* $ WORK( LWORK ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > Solve the least squares problem */ +/* > f2cmin || A*X - B || */ +/* > using the QR factorization */ +/* > A = Q*R */ +/* > computed by SGEQRF. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. M >= N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of B. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is REAL array, dimension (LDA,N) */ +/* > Details of the QR factorization of the original matrix A as */ +/* > returned by SGEQRF. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TAU */ +/* > \verbatim */ +/* > TAU is REAL array, dimension (N) */ +/* > Details of the orthogonal matrix Q. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is REAL array, dimension (LDB,NRHS) */ +/* > On entry, the m-by-nrhs right hand side matrix B. */ +/* > On exit, the n-by-nrhs solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is REAL array, dimension (LWORK) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The length of the array WORK. LWORK must be at least NRHS, */ +/* > and should be at least NRHS*NB, where NB is the block size */ +/* > for this environment. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup single_lin */ + +/* ===================================================================== */ +/* Subroutine */ int sgeqrs_(integer *m, integer *n, integer *nrhs, real *a, + integer *lda, real *tau, real *b, integer *ldb, real *work, integer * + lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1; + + /* Local variables */ + extern /* Subroutine */ int strsm_(char *, char *, char *, char *, + integer *, integer *, real *, real *, integer *, real *, integer * + ), xerbla_(char *, integer *), sormqr_(char *, char *, integer *, integer *, integer *, + real *, integer *, real *, real *, integer *, real *, integer *, + integer *); + + +/* -- LAPACK test routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --tau; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + if (*m < 0) { + *info = -1; + } else if (*n < 0 || *n > *m) { + *info = -2; + } else if (*nrhs < 0) { + *info = -3; + } else if (*lda < f2cmax(1,*m)) { + *info = -5; + } else if (*ldb < f2cmax(1,*m)) { + *info = -8; + } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { + *info = -10; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("SGEQRS", &i__1); + return 0; + } + +/* Quick return if possible */ + + if (*n == 0 || *nrhs == 0 || *m == 0) { + return 0; + } + +/* B := Q' * B */ + + sormqr_("Left", "Transpose", m, nrhs, n, &a[a_offset], lda, &tau[1], &b[ + b_offset], ldb, &work[1], lwork, info); + +/* Solve R*X = B(1:n,:) */ + + strsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b9, &a[ + a_offset], lda, &b[b_offset], ldb); + + return 0; + +/* End of SGEQRS */ + +} /* sgeqrs_ */ + diff --git a/lapack-netlib/TESTING/LIN/sgeqrs.f b/lapack-netlib/SRC/DEPRECATED/sgeqrs.f similarity index 100% rename from lapack-netlib/TESTING/LIN/sgeqrs.f rename to lapack-netlib/SRC/DEPRECATED/sgeqrs.f diff --git a/lapack-netlib/SRC/DEPRECATED/sggsvd.c b/lapack-netlib/SRC/DEPRECATED/sggsvd.c index 0af712876..a10edf060 100644 --- a/lapack-netlib/SRC/DEPRECATED/sggsvd.c +++ b/lapack-netlib/SRC/DEPRECATED/sggsvd.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,247 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief ZGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief ZGEGS computes the eigenvalues, Schur form, and, optionally, the left and or/right Schur vectors of a complex matrix pair (A,B) * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/zgegv.c b/lapack-netlib/SRC/DEPRECATED/zgegv.c index 991a6ff11..8272bd918 100644 --- a/lapack-netlib/SRC/DEPRECATED/zgegv.c +++ b/lapack-netlib/SRC/DEPRECATED/zgegv.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief ZGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief ZGEGV computes the eigenvalues and, optionally, the left and/or right eigenvectors of a complex matrix pair (A,B). * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/zgelqs.c b/lapack-netlib/SRC/DEPRECATED/zgelqs.c new file mode 100644 index 000000000..b77ba906a --- /dev/null +++ b/lapack-netlib/SRC/DEPRECATED/zgelqs.c @@ -0,0 +1,481 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static doublecomplex c_b1 = {0.,0.}; +static doublecomplex c_b2 = {1.,0.}; + +/* > \brief \b ZGELQS */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE ZGELQS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* COMPLEX*16 A( LDA, * ), B( LDB, * ), TAU( * ), */ +/* $ WORK( LWORK ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > Compute a minimum-norm solution */ +/* > f2cmin || A*X - B || */ +/* > using the LQ factorization */ +/* > A = L*Q */ +/* > computed by ZGELQF. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of B. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX*16 array, dimension (LDA,N) */ +/* > Details of the LQ factorization of the original matrix A as */ +/* > returned by ZGELQF. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TAU */ +/* > \verbatim */ +/* > TAU is COMPLEX*16 array, dimension (M) */ +/* > Details of the orthogonal matrix Q. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is COMPLEX*16 array, dimension (LDB,NRHS) */ +/* > On entry, the m-by-nrhs right hand side matrix B. */ +/* > On exit, the n-by-nrhs solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= N. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is COMPLEX*16 array, dimension (LWORK) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The length of the array WORK. LWORK must be at least NRHS, */ +/* > and should be at least NRHS*NB, where NB is the block size */ +/* > for this environment. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup complex16_lin */ + +/* ===================================================================== */ +/* Subroutine */ int zgelqs_(integer *m, integer *n, integer *nrhs, + doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *b, + integer *ldb, doublecomplex *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1; + + /* Local variables */ + extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, + integer *, integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *), + xerbla_(char *, integer *), zlaset_(char *, integer *, + integer *, doublecomplex *, doublecomplex *, doublecomplex *, + integer *), zunmlq_(char *, char *, integer *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *, integer *); + + +/* -- LAPACK test routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --tau; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + if (*m < 0) { + *info = -1; + } else if (*n < 0 || *m > *n) { + *info = -2; + } else if (*nrhs < 0) { + *info = -3; + } else if (*lda < f2cmax(1,*m)) { + *info = -5; + } else if (*ldb < f2cmax(1,*n)) { + *info = -8; + } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { + *info = -10; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZGELQS", &i__1); + return 0; + } + +/* Quick return if possible */ + + if (*n == 0 || *nrhs == 0 || *m == 0) { + return 0; + } + +/* Solve L*X = B(1:m,:) */ + + ztrsm_("Left", "Lower", "No transpose", "Non-unit", m, nrhs, &c_b2, &a[ + a_offset], lda, &b[b_offset], ldb); + +/* Set B(m+1:n,:) to zero */ + + if (*m < *n) { + i__1 = *n - *m; + zlaset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[*m + 1 + b_dim1], ldb); + } + +/* B := Q' * B */ + + zunmlq_("Left", "Conjugate transpose", n, nrhs, m, &a[a_offset], lda, & + tau[1], &b[b_offset], ldb, &work[1], lwork, info); + + return 0; + +/* End of ZGELQS */ + +} /* zgelqs_ */ + diff --git a/lapack-netlib/TESTING/LIN/zgelqs.f b/lapack-netlib/SRC/DEPRECATED/zgelqs.f similarity index 100% rename from lapack-netlib/TESTING/LIN/zgelqs.f rename to lapack-netlib/SRC/DEPRECATED/zgelqs.f diff --git a/lapack-netlib/SRC/DEPRECATED/zgelsx.c b/lapack-netlib/SRC/DEPRECATED/zgelsx.c index f1f39eb9c..82d195550 100644 --- a/lapack-netlib/SRC/DEPRECATED/zgelsx.c +++ b/lapack-netlib/SRC/DEPRECATED/zgelsx.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static doublecomplex c_b1 = {1.,0.}; + +/* > \brief \b ZGEQRS */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE ZGEQRS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* COMPLEX*16 A( LDA, * ), B( LDB, * ), TAU( * ), */ +/* $ WORK( LWORK ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > Solve the least squares problem */ +/* > f2cmin || A*X - B || */ +/* > using the QR factorization */ +/* > A = Q*R */ +/* > computed by ZGEQRF. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. M >= N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of B. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX*16 array, dimension (LDA,N) */ +/* > Details of the QR factorization of the original matrix A as */ +/* > returned by ZGEQRF. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TAU */ +/* > \verbatim */ +/* > TAU is COMPLEX*16 array, dimension (N) */ +/* > Details of the orthogonal matrix Q. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is COMPLEX*16 array, dimension (LDB,NRHS) */ +/* > On entry, the m-by-nrhs right hand side matrix B. */ +/* > On exit, the n-by-nrhs solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is COMPLEX*16 array, dimension (LWORK) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The length of the array WORK. LWORK must be at least NRHS, */ +/* > and should be at least NRHS*NB, where NB is the block size */ +/* > for this environment. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup complex16_lin */ + +/* ===================================================================== */ +/* Subroutine */ int zgeqrs_(integer *m, integer *n, integer *nrhs, + doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *b, + integer *ldb, doublecomplex *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1; + + /* Local variables */ + extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, + integer *, integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *), + xerbla_(char *, integer *), zunmqr_(char *, char *, + integer *, integer *, integer *, doublecomplex *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, integer *); + + +/* -- LAPACK test routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --tau; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + if (*m < 0) { + *info = -1; + } else if (*n < 0 || *n > *m) { + *info = -2; + } else if (*nrhs < 0) { + *info = -3; + } else if (*lda < f2cmax(1,*m)) { + *info = -5; + } else if (*ldb < f2cmax(1,*m)) { + *info = -8; + } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { + *info = -10; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZGEQRS", &i__1); + return 0; + } + +/* Quick return if possible */ + + if (*n == 0 || *nrhs == 0 || *m == 0) { + return 0; + } + +/* B := Q' * B */ + + zunmqr_("Left", "Conjugate transpose", m, nrhs, n, &a[a_offset], lda, & + tau[1], &b[b_offset], ldb, &work[1], lwork, info); + +/* Solve R*X = B(1:n,:) */ + + ztrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b1, &a[ + a_offset], lda, &b[b_offset], ldb); + + return 0; + +/* End of ZGEQRS */ + +} /* zgeqrs_ */ + diff --git a/lapack-netlib/TESTING/LIN/zgeqrs.f b/lapack-netlib/SRC/DEPRECATED/zgeqrs.f similarity index 100% rename from lapack-netlib/TESTING/LIN/zgeqrs.f rename to lapack-netlib/SRC/DEPRECATED/zgeqrs.f diff --git a/lapack-netlib/SRC/DEPRECATED/zggsvd.c b/lapack-netlib/SRC/DEPRECATED/zggsvd.c index 8f1c7e46c..e9d723404 100644 --- a/lapack-netlib/SRC/DEPRECATED/zggsvd.c +++ b/lapack-netlib/SRC/DEPRECATED/zggsvd.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,248 +260,8 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i In particular, if B is an N-by-N nonsingular matrix, then the GSVD of *> A and B implicitly gives the SVD of A*inv(B): *> A*inv(B) = U*(D1*inv(D2))*V**H. -*> If ( A**H,B**H)**H has orthnormal columns, then the GSVD of A and B is also +*> If ( A**H,B**H)**H has orthonormal columns, then the GSVD of A and B is also *> equal to the CS decomposition of A and B. Furthermore, the GSVD can *> be used to derive the solution of the eigenvalue problem: *> A**H*A x = lambda* B**H*B x. diff --git a/lapack-netlib/SRC/DEPRECATED/zggsvp.c b/lapack-netlib/SRC/DEPRECATED/zggsvp.c index cbe9a9083..0409e97b1 100644 --- a/lapack-netlib/SRC/DEPRECATED/zggsvp.c +++ b/lapack-netlib/SRC/DEPRECATED/zggsvp.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,248 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b CPOTRF VARIANT: right looking block version of the algorithm, calli C>\details \b Purpose: C>\verbatim C> -C> CPOTRF computes the Cholesky factorization of a real Hermitian +C> CPOTRF computes the Cholesky factorization of a complex Hermitian C> positive definite matrix A. C> C> The factorization has the form @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f index 1575bd95c..e0a621b2e 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f index 67ebae335..f3b66a9e3 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f index 76d8bab61..dda42faf9 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f @@ -24,7 +24,7 @@ C> \brief \b ZPOTRF VARIANT: right looking block version of the algorithm, calli C>\details \b Purpose: C>\verbatim C> -C> ZPOTRF computes the Cholesky factorization of a real Hermitian +C> ZPOTRF computes the Cholesky factorization of a complex Hermitian C> positive definite matrix A. C> C> The factorization has the form @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f index d6149c08f..b9dffa4ec 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f @@ -24,7 +24,7 @@ C> \brief \b CPOTRF VARIANT: top-looking block version of the algorithm, calling C>\details \b Purpose: C>\verbatim C> -C> CPOTRF computes the Cholesky factorization of a real symmetric +C> CPOTRF computes the Cholesky factorization of a complex Hermitian C> positive definite matrix A. C> C> The factorization has the form @@ -55,7 +55,7 @@ C> C> \param[in,out] A C> \verbatim C> A is COMPLEX array, dimension (LDA,N) -C> On entry, the symmetric matrix A. If UPLO = 'U', the leading +C> On entry, the Hermitian matrix A. If UPLO = 'U', the leading C> N-by-N upper triangular part of A contains the upper C> triangular part of the matrix A, and the strictly lower C> triangular part of A is not referenced. If UPLO = 'L', the @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f index e49200ea5..e68a559ba 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f index 65895502b..7401f8844 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f index f8b9e253c..80b7c7f43 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f @@ -24,7 +24,7 @@ C> \brief \b ZPOTRF VARIANT: top-looking block version of the algorithm, calling C>\details \b Purpose: C>\verbatim C> -C> ZPOTRF computes the Cholesky factorization of a real symmetric +C> ZPOTRF computes the Cholesky factorization of a complex Hermitian C> positive definite matrix A. C> C> The factorization has the form @@ -55,7 +55,7 @@ C> C> \param[in,out] A C> \verbatim C> A is COMPLEX*16 array, dimension (LDA,N) -C> On entry, the symmetric matrix A. If UPLO = 'U', the leading +C> On entry, the Hermitian matrix A. If UPLO = 'U', the leading C> N-by-N upper triangular part of A contains the upper C> triangular part of the matrix A, and the strictly lower C> triangular part of A is not referenced. If UPLO = 'L', the @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f index 46eaf33b9..d3f78b8be 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f @@ -23,7 +23,7 @@ C> \brief \b CGEQRF VARIANT: left-looking Level 3 BLAS version of the algorithm. C>\details \b Purpose: C>\verbatim C> -C> CGEQRF computes a QR factorization of a real M-by-N matrix A: +C> CGEQRF computes a QR factorization of a complex M-by-N matrix A: C> A = Q * R. C> C> This is the left-looking Level 3 BLAS version of the algorithm. @@ -172,12 +172,12 @@ C> EXTERNAL CGEQR2, CLARFB, CLARFT, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC CEILING, MAX, MIN, REAL * .. * .. External Functions .. INTEGER ILAENV - REAL SCEIL - EXTERNAL ILAENV, SCEIL + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. @@ -205,13 +205,13 @@ C> * * So here 4 x 4 is the last T stored in the workspace * - NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB + NT = K-CEILING(REAL(K-NX)/REAL(NB))*NB * * optimal workspace = space for dlarfb + space for normal T's + space for the last T * LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) - LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) + LLWORK = CEILING(REAL(LLWORK)/REAL(NB)) IF( K.EQ.0 ) THEN @@ -226,13 +226,13 @@ C> * Optimal workspace for dlarfb = MAX(1,N)*NT * LWKOPT = (LBWORK+LLWORK)*NB - WORK( 1 ) = (LWKOPT+NT*NT) + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT+NT*NT) ELSE - LBWORK = SCEIL(REAL(K)/REAL(NB))*NB + LBWORK = CEILING(REAL(K)/REAL(NB))*NB LWKOPT = (LBWORK+LLWORK-NB)*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF @@ -414,7 +414,7 @@ C> END IF - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of CGEQRF diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f index 55cab8b23..dd8baab4d 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f @@ -172,12 +172,12 @@ C> EXTERNAL DGEQR2, DLARFB, DLARFT, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC CEILING, MAX, MIN, REAL * .. * .. External Functions .. INTEGER ILAENV - REAL SCEIL - EXTERNAL ILAENV, SCEIL + DOUBLE PRECISION DROUNDUP_LWORK + EXTERNAL ILAENV, DROUNDUP_LWORK * .. * .. Executable Statements .. @@ -205,13 +205,13 @@ C> * * So here 4 x 4 is the last T stored in the workspace * - NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB + NT = K-CEILING(REAL(K-NX)/REAL(NB))*NB * * optimal workspace = space for dlarfb + space for normal T's + space for the last T * LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) - LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) + LLWORK = CEILING(REAL(LLWORK)/REAL(NB)) IF( K.EQ.0 ) THEN @@ -226,13 +226,13 @@ C> * Optimal workspace for dlarfb = MAX(1,N)*NT * LWKOPT = (LBWORK+LLWORK)*NB - WORK( 1 ) = (LWKOPT+NT*NT) + WORK( 1 ) = DROUNDUP_LWORK(LWKOPT+NT*NT) ELSE - LBWORK = SCEIL(REAL(K)/REAL(NB))*NB + LBWORK = CEILING(REAL(K)/REAL(NB))*NB LWKOPT = (LBWORK+LLWORK-NB)*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = DROUNDUP_LWORK(LWKOPT) END IF @@ -414,7 +414,7 @@ C> END IF - WORK( 1 ) = IWS + WORK( 1 ) = DROUNDUP_LWORK(IWS) RETURN * * End of DGEQRF diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/sceil.f b/lapack-netlib/SRC/VARIANTS/qr/LL/sceil.f deleted file mode 100644 index a007360ba..000000000 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/sceil.f +++ /dev/null @@ -1,86 +0,0 @@ -C> \brief \b SCEIL -* -* =========== DOCUMENTATION =========== -* -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ -* -* Definition: -* =========== -* -* REAL FUNCTION SCEIL( A ) -* -* .. Scalar Arguments .. -* REAL A -* .. -* -* ===================================================================== -* -* .. Intrinsic Functions .. -* INTRINSIC INT -* .. -* .. Executable Statements ..* -* -* IF (A-INT(A).EQ.0) THEN -* SCEIL = A -* ELSE IF (A.GT.0) THEN -* SCEIL = INT(A)+1; -* ELSE -* SCEIL = INT(A) -* END IF -* -* RETURN -* -* END -* Purpose -* ======= -* -C>\details \b Purpose: -C>\verbatim -C>\endverbatim -* -* Arguments: -* ========== -* -* -* Authors: -* ======== -* -C> \author Univ. of Tennessee -C> \author Univ. of California Berkeley -C> \author Univ. of Colorado Denver -C> \author NAG Ltd. -* -C> \date December 2016 -* -C> \ingroup variantsOTHERcomputational -* -* ===================================================================== - REAL FUNCTION SCEIL( A ) -* -* -- LAPACK computational routine -- -* -- LAPACK is a software package provided by Univ. of Tennessee, -- -* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* -* .. Scalar Arguments ..* - REAL A -* .. -* -* ===================================================================== -* -* .. Intrinsic Functions .. - INTRINSIC INT -* .. -* .. Executable Statements ..* -* - IF (A-INT(A).EQ.0) THEN - SCEIL = A - ELSE IF (A.GT.0) THEN - SCEIL = INT(A)+1; - ELSE - SCEIL = INT(A) - END IF - - RETURN -* - END diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f index d2ad13ced..93dc48fa5 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f @@ -172,12 +172,12 @@ C> EXTERNAL SGEQR2, SLARFB, SLARFT, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC CEILING, MAX, MIN, REAL * .. * .. External Functions .. INTEGER ILAENV - REAL SCEIL - EXTERNAL ILAENV, SCEIL + DOUBLE PRECISION DROUNDUP_LWORK + EXTERNAL ILAENV, DROUNDUP_LWORK * .. * .. Executable Statements .. @@ -205,13 +205,13 @@ C> * * So here 4 x 4 is the last T stored in the workspace * - NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB + NT = K-CEILING(REAL(K-NX)/REAL(NB))*NB * * optimal workspace = space for dlarfb + space for normal T's + space for the last T * LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) - LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) + LLWORK = CEILING(REAL(LLWORK)/REAL(NB)) IF( K.EQ.0 ) THEN @@ -226,13 +226,13 @@ C> * Optimal workspace for dlarfb = MAX(1,N)*NT * LWKOPT = (LBWORK+LLWORK)*NB - WORK( 1 ) = (LWKOPT+NT*NT) + WORK( 1 ) = DROUNDUP_LWORK(LWKOPT+NT*NT) ELSE - LBWORK = SCEIL(REAL(K)/REAL(NB))*NB + LBWORK = CEILING(REAL(K)/REAL(NB))*NB LWKOPT = (LBWORK+LLWORK-NB)*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = DROUNDUP_LWORK(LWKOPT) END IF @@ -414,7 +414,7 @@ C> END IF - WORK( 1 ) = IWS + WORK( 1 ) = DROUNDUP_LWORK(IWS) RETURN * * End of SGEQRF diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f index 623b88a8a..3ef07bfc7 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f @@ -23,7 +23,7 @@ C> \brief \b ZGEQRF VARIANT: left-looking Level 3 BLAS of the algorithm. C>\details \b Purpose: C>\verbatim C> -C> ZGEQRF computes a QR factorization of a real M-by-N matrix A: +C> ZGEQRF computes a QR factorization of a complex M-by-N matrix A: C> A = Q * R. C> C> This is the left-looking Level 3 BLAS version of the algorithm. @@ -172,12 +172,12 @@ C> EXTERNAL ZGEQR2, ZLARFB, ZLARFT, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC CEILING, MAX, MIN, REAL * .. * .. External Functions .. INTEGER ILAENV - REAL SCEIL - EXTERNAL ILAENV, SCEIL + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. @@ -205,13 +205,13 @@ C> * * So here 4 x 4 is the last T stored in the workspace * - NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB + NT = K-CEILING(REAL(K-NX)/REAL(NB))*NB * * optimal workspace = space for dlarfb + space for normal T's + space for the last T * LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) - LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) + LLWORK = CEILING(REAL(LLWORK)/REAL(NB)) IF( K.EQ.0 ) THEN @@ -226,13 +226,13 @@ C> * Optimal workspace for dlarfb = MAX(1,N)*NT * LWKOPT = (LBWORK+LLWORK)*NB - WORK( 1 ) = (LWKOPT+NT*NT) + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT+NT*NT) ELSE - LBWORK = SCEIL(REAL(K)/REAL(NB))*NB + LBWORK = CEILING(REAL(K)/REAL(NB))*NB LWKOPT = (LBWORK+LLWORK-NB)*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF @@ -414,7 +414,7 @@ C> END IF - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of ZGEQRF diff --git a/lapack-netlib/SRC/cbbcsd.f b/lapack-netlib/SRC/cbbcsd.f index 1cd1ffbf1..4d0c45efe 100644 --- a/lapack-netlib/SRC/cbbcsd.f +++ b/lapack-netlib/SRC/cbbcsd.f @@ -805,7 +805,7 @@ CALL SLARTGP( B22BULGE, B22E(I-1), RWORK(IU2SN+I-1), $ RWORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL SLARTGS( B21E(I), B21E(I+1), NU, RWORK(IU2CS+I-1), + CALL SLARTGS( B21E(I), B21D(I+1), NU, RWORK(IU2CS+I-1), $ RWORK(IU2SN+I-1) ) ELSE CALL SLARTGS( B22D(I), B22E(I), MU, RWORK(IU2CS+I-1), diff --git a/lapack-netlib/SRC/cbdsqr.f b/lapack-netlib/SRC/cbdsqr.f index 1d7c4e09d..cf1459ad2 100644 --- a/lapack-netlib/SRC/cbdsqr.f +++ b/lapack-netlib/SRC/cbdsqr.f @@ -204,6 +204,17 @@ *> algorithm through its inner loop. The algorithms stops *> (and so fails to converge) if the number of passes *> through the inner loop exceeds MAXITR*N**2. +*> +*> \endverbatim +* +*> \par Note: +* =========== +*> +*> \verbatim +*> Bug report from Cezary Dendek. +*> On November 3rd 2023, the INTEGER variable MAXIT = MAXITR*N**2 is +*> removed since it can overflow pretty easily (for N larger or equal +*> than 18,919). We instead use MAXITDIVN = MAXITR*N. *> \endverbatim * * Authors: @@ -214,7 +225,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup bdsqr * * ===================================================================== SUBROUTINE CBDSQR( UPLO, N, NCVT, NRU, NCC, D, E, VT, LDVT, U, @@ -255,11 +266,11 @@ * .. * .. Local Scalars .. LOGICAL LOWER, ROTATE - INTEGER I, IDIR, ISUB, ITER, J, LL, LLL, M, MAXIT, NM1, - $ NM12, NM13, OLDLL, OLDM + INTEGER I, IDIR, ISUB, ITER, ITERDIVN, J, LL, LLL, M, + $ MAXITDIVN, NM1, NM12, NM13, OLDLL, OLDM REAL ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU, $ OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL, - $ SINR, SLL, SMAX, SMIN, SMINL, SMINOA, + $ SINR, SLL, SMAX, SMIN, SMINOA, $ SN, THRESH, TOL, TOLMUL, UNFL * .. * .. External Functions .. @@ -372,7 +383,7 @@ DO 30 I = 1, N - 1 SMAX = MAX( SMAX, ABS( E( I ) ) ) 30 CONTINUE - SMINL = ZERO + SMIN = ZERO IF( TOL.GE.ZERO ) THEN * * Relative accuracy desired @@ -389,20 +400,21 @@ 40 CONTINUE 50 CONTINUE SMINOA = SMINOA / SQRT( REAL( N ) ) - THRESH = MAX( TOL*SMINOA, MAXITR*N*N*UNFL ) + THRESH = MAX( TOL*SMINOA, MAXITR*(N*(N*UNFL)) ) ELSE * * Absolute accuracy desired * - THRESH = MAX( ABS( TOL )*SMAX, MAXITR*N*N*UNFL ) + THRESH = MAX( ABS( TOL )*SMAX, MAXITR*(N*(N*UNFL)) ) END IF * * Prepare for main iteration loop for the singular values * (MAXIT is the maximum number of passes through the inner * loop permitted before nonconvergence signalled.) * - MAXIT = MAXITR*N*N - ITER = 0 + MAXITDIVN = MAXITR*N + ITERDIVN = 0 + ITER = -1 OLDLL = -1 OLDM = -1 * @@ -418,15 +430,18 @@ * IF( M.LE.1 ) $ GO TO 160 - IF( ITER.GT.MAXIT ) - $ GO TO 200 + IF( ITER.GE.N ) THEN + ITER = ITER - N + ITERDIVN = ITERDIVN + 1 + IF( ITERDIVN.GE.MAXITDIVN ) + $ GO TO 200 + END IF * * Find diagonal block of matrix to work on * IF( TOL.LT.ZERO .AND. ABS( D( M ) ).LE.THRESH ) $ D( M ) = ZERO SMAX = ABS( D( M ) ) - SMIN = SMAX DO 70 LLL = 1, M - 1 LL = M - LLL ABSS = ABS( D( LL ) ) @@ -435,7 +450,6 @@ $ D( LL ) = ZERO IF( ABSE.LE.THRESH ) $ GO TO 80 - SMIN = MIN( SMIN, ABSS ) SMAX = MAX( SMAX, ABSS, ABSE ) 70 CONTINUE LL = 0 @@ -517,14 +531,14 @@ * apply convergence criterion forward * MU = ABS( D( LL ) ) - SMINL = MU + SMIN = MU DO 100 LLL = LL, M - 1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL+1 ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 100 CONTINUE END IF * @@ -545,14 +559,14 @@ * apply convergence criterion backward * MU = ABS( D( M ) ) - SMINL = MU + SMIN = MU DO 110 LLL = M - 1, LL, -1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 110 CONTINUE END IF END IF @@ -562,7 +576,7 @@ * Compute shift. First, test if shifting would ruin relative * accuracy, and if so set the shift to zero. * - IF( TOL.GE.ZERO .AND. N*TOL*( SMINL / SMAX ).LE. + IF( TOL.GE.ZERO .AND. N*TOL*( SMIN / SMAX ).LE. $ MAX( EPS, HNDRTH*TOL ) ) THEN * * Use a zero shift to avoid loss of relative accuracy diff --git a/lapack-netlib/SRC/cgebal.f b/lapack-netlib/SRC/cgebal.f index 5d1ebb026..3f54d3937 100644 --- a/lapack-netlib/SRC/cgebal.f +++ b/lapack-netlib/SRC/cgebal.f @@ -85,6 +85,7 @@ *> \verbatim *> ILO is INTEGER *> \endverbatim +*> *> \param[out] IHI *> \verbatim *> IHI is INTEGER @@ -154,6 +155,9 @@ *> *> Modified by Tzu-Yi Chen, Computer Science Division, University of *> California at Berkeley, USA +*> +*> Refactored by Evert Provoost, Department of Computer Science, +*> KU Leuven, Belgium *> \endverbatim *> * ===================================================================== @@ -183,8 +187,8 @@ PARAMETER ( FACTOR = 0.95E+0 ) * .. * .. Local Scalars .. - LOGICAL NOCONV - INTEGER I, ICA, IEXC, IRA, J, K, L, M + LOGICAL NOCONV, CANSWAP + INTEGER I, ICA, IRA, J, K, L REAL C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1, $ SFMIN2 * .. @@ -195,10 +199,10 @@ EXTERNAL SISNAN, LSAME, ICAMAX, SLAMCH, SCNRM2 * .. * .. External Subroutines .. - EXTERNAL CSSCAL, CSWAP, XERBLA + EXTERNAL XERBLA, CSSCAL, CSWAP * .. * .. Intrinsic Functions .. - INTRINSIC ABS, AIMAG, MAX, MIN, REAL + INTRINSIC ABS, REAL, AIMAG, MAX, MIN * * Test the input parameters * @@ -216,176 +220,194 @@ RETURN END IF * - K = 1 - L = N +* Quick returns. * - IF( N.EQ.0 ) - $ GO TO 210 + IF( N.EQ.0 ) THEN + ILO = 1 + IHI = 0 + RETURN + END IF * IF( LSAME( JOB, 'N' ) ) THEN - DO 10 I = 1, N + DO I = 1, N SCALE( I ) = ONE - 10 CONTINUE - GO TO 210 + END DO + ILO = 1 + IHI = N + RETURN END IF * - IF( LSAME( JOB, 'S' ) ) - $ GO TO 120 -* -* Permutation to isolate eigenvalues if possible -* - GO TO 50 -* -* Row and column exchange. -* - 20 CONTINUE - SCALE( M ) = J - IF( J.EQ.M ) - $ GO TO 30 -* - CALL CSWAP( L, A( 1, J ), 1, A( 1, M ), 1 ) - CALL CSWAP( N-K+1, A( J, K ), LDA, A( M, K ), LDA ) -* - 30 CONTINUE - GO TO ( 40, 80 )IEXC -* -* Search for rows isolating an eigenvalue and push them down. -* - 40 CONTINUE - IF( L.EQ.1 ) - $ GO TO 210 - L = L - 1 -* - 50 CONTINUE - DO 70 J = L, 1, -1 +* Permutation to isolate eigenvalues if possible. * - DO 60 I = 1, L - IF( I.EQ.J ) - $ GO TO 60 - IF( REAL( A( J, I ) ).NE.ZERO .OR. AIMAG( A( J, I ) ).NE. - $ ZERO )GO TO 70 - 60 CONTINUE -* - M = L - IEXC = 1 - GO TO 20 - 70 CONTINUE -* - GO TO 90 + K = 1 + L = N * -* Search for columns isolating an eigenvalue and push them left. + IF( .NOT.LSAME( JOB, 'S' ) ) THEN * - 80 CONTINUE - K = K + 1 +* Row and column exchange. * - 90 CONTINUE - DO 110 J = K, L + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for rows isolating an eigenvalue and push them down. +* + NOCONV = .FALSE. + DO I = L, 1, -1 + CANSWAP = .TRUE. + DO J = 1, L + IF( I.NE.J .AND. ( REAL( A( I, J ) ).NE.ZERO .OR. + $ AIMAG( A( I, J ) ).NE.ZERO ) ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( L ) = I + IF( I.NE.L ) THEN + CALL CSWAP( L, A( 1, I ), 1, A( 1, L ), 1 ) + CALL CSWAP( N-K+1, A( I, K ), LDA, A( L, K ), LDA ) + END IF + NOCONV = .TRUE. +* + IF( L.EQ.1 ) THEN + ILO = 1 + IHI = 1 + RETURN + END IF +* + L = L - 1 + END IF + END DO +* + END DO + + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for columns isolating an eigenvalue and push them left. +* + NOCONV = .FALSE. + DO J = K, L + CANSWAP = .TRUE. + DO I = K, L + IF( I.NE.J .AND. ( REAL( A( I, J ) ).NE.ZERO .OR. + $ AIMAG( A( I, J ) ).NE.ZERO ) ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( K ) = J + IF( J.NE.K ) THEN + CALL CSWAP( L, A( 1, J ), 1, A( 1, K ), 1 ) + CALL CSWAP( N-K+1, A( J, K ), LDA, A( K, K ), LDA ) + END IF + NOCONV = .TRUE. +* + K = K + 1 + END IF + END DO +* + END DO * - DO 100 I = K, L - IF( I.EQ.J ) - $ GO TO 100 - IF( REAL( A( I, J ) ).NE.ZERO .OR. AIMAG( A( I, J ) ).NE. - $ ZERO )GO TO 110 - 100 CONTINUE + END IF * - M = K - IEXC = 2 - GO TO 20 - 110 CONTINUE +* Initialize SCALE for non-permuted submatrix. * - 120 CONTINUE - DO 130 I = K, L + DO I = K, L SCALE( I ) = ONE - 130 CONTINUE + END DO * - IF( LSAME( JOB, 'P' ) ) - $ GO TO 210 +* If we only had to permute, we are done. +* + IF( LSAME( JOB, 'P' ) ) THEN + ILO = K + IHI = L + RETURN + END IF * * Balance the submatrix in rows K to L. * -* Iterative loop for norm reduction +* Iterative loop for norm reduction. * SFMIN1 = SLAMCH( 'S' ) / SLAMCH( 'P' ) SFMAX1 = ONE / SFMIN1 SFMIN2 = SFMIN1*SCLFAC SFMAX2 = ONE / SFMIN2 - 140 CONTINUE - NOCONV = .FALSE. -* - DO 200 I = K, L -* - C = SCNRM2( L-K+1, A( K, I ), 1 ) - R = SCNRM2( L-K+1, A( I , K ), LDA ) - ICA = ICAMAX( L, A( 1, I ), 1 ) - CA = ABS( A( ICA, I ) ) - IRA = ICAMAX( N-K+1, A( I, K ), LDA ) - RA = ABS( A( I, IRA+K-1 ) ) -* -* Guard against zero C or R due to underflow. -* - IF( C.EQ.ZERO .OR. R.EQ.ZERO ) - $ GO TO 200 - G = R / SCLFAC - F = ONE - S = C + R - 160 CONTINUE - IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR. - $ MIN( R, G, RA ).LE.SFMIN2 )GO TO 170 - IF( SISNAN( C+F+CA+R+G+RA ) ) THEN * -* Exit if NaN to avoid infinite loop + NOCONV = .TRUE. + DO WHILE( NOCONV ) + NOCONV = .FALSE. * - INFO = -3 - CALL XERBLA( 'CGEBAL', -INFO ) - RETURN - END IF - F = F*SCLFAC - C = C*SCLFAC - CA = CA*SCLFAC - R = R / SCLFAC - G = G / SCLFAC - RA = RA / SCLFAC - GO TO 160 -* - 170 CONTINUE - G = C / SCLFAC - 180 CONTINUE - IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR. - $ MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190 - F = F / SCLFAC - C = C / SCLFAC - G = G / SCLFAC - CA = CA / SCLFAC - R = R*SCLFAC - RA = RA*SCLFAC - GO TO 180 -* -* Now balance. -* - 190 CONTINUE - IF( ( C+R ).GE.FACTOR*S ) - $ GO TO 200 - IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN - IF( F*SCALE( I ).LE.SFMIN1 ) - $ GO TO 200 - END IF - IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN - IF( SCALE( I ).GE.SFMAX1 / F ) - $ GO TO 200 - END IF - G = ONE / F - SCALE( I ) = SCALE( I )*F - NOCONV = .TRUE. + DO I = K, L * - CALL CSSCAL( N-K+1, G, A( I, K ), LDA ) - CALL CSSCAL( L, F, A( 1, I ), 1 ) + C = SCNRM2( L-K+1, A( K, I ), 1 ) + R = SCNRM2( L-K+1, A( I, K ), LDA ) + ICA = ICAMAX( L, A( 1, I ), 1 ) + CA = ABS( A( ICA, I ) ) + IRA = ICAMAX( N-K+1, A( I, K ), LDA ) + RA = ABS( A( I, IRA+K-1 ) ) * - 200 CONTINUE +* Guard against zero C or R due to underflow. +* + IF( C.EQ.ZERO .OR. R.EQ.ZERO ) CYCLE +* +* Exit if NaN to avoid infinite loop * - IF( NOCONV ) - $ GO TO 140 + IF( SISNAN( C+CA+R+RA ) ) THEN + INFO = -3 + CALL XERBLA( 'CGEBAL', -INFO ) + RETURN + END IF +* + G = R / SCLFAC + F = ONE + S = C + R +* + DO WHILE( C.LT.G .AND. MAX( F, C, CA ).LT.SFMAX2 .AND. + $ MIN( R, G, RA ).GT.SFMIN2 ) + F = F*SCLFAC + C = C*SCLFAC + CA = CA*SCLFAC + R = R / SCLFAC + G = G / SCLFAC + RA = RA / SCLFAC + END DO +* + G = C / SCLFAC +* + DO WHILE( G.GE.R .AND. MAX( R, RA ).LT.SFMAX2 .AND. + $ MIN( F, C, G, CA ).GT.SFMIN2 ) + F = F / SCLFAC + C = C / SCLFAC + G = G / SCLFAC + CA = CA / SCLFAC + R = R*SCLFAC + RA = RA*SCLFAC + END DO +* +* Now balance. +* + IF( ( C+R ).GE.FACTOR*S ) CYCLE + IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN + IF( F*SCALE( I ).LE.SFMIN1 ) CYCLE + END IF + IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN + IF( SCALE( I ).GE.SFMAX1 / F ) CYCLE + END IF + G = ONE / F + SCALE( I ) = SCALE( I )*F + NOCONV = .TRUE. +* + CALL CSSCAL( N-K+1, G, A( I, K ), LDA ) + CALL CSSCAL( L, F, A( 1, I ), 1 ) +* + END DO +* + END DO * - 210 CONTINUE ILO = K IHI = L * diff --git a/lapack-netlib/SRC/cgebrd.f b/lapack-netlib/SRC/cgebrd.f index 5687161a5..5920b1cf5 100644 --- a/lapack-netlib/SRC/cgebrd.f +++ b/lapack-netlib/SRC/cgebrd.f @@ -123,7 +123,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -148,7 +149,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup gebrd * *> \par Further Details: * ===================== @@ -225,8 +226,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. EXTERNAL CGEBD2, CGEMM, CLABRD, XERBLA @@ -236,16 +237,24 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'CGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB - WORK( 1 ) = REAL( LWKOPT ) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'CGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -253,7 +262,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -265,7 +274,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -284,7 +292,7 @@ * Determine when to switch from blocked to unblocked code. * IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using @@ -343,7 +351,7 @@ * CALL CGEBD2( M-I+1, N-I+1, A( I, I ), LDA, D( I ), E( I ), $ TAUQ( I ), TAUP( I ), WORK, IINFO ) - WORK( 1 ) = WS + WORK( 1 ) = SROUNDUP_LWORK( WS ) RETURN * * End of CGEBRD diff --git a/lapack-netlib/SRC/cgecon.f b/lapack-netlib/SRC/cgecon.f index 6f426c2ab..e018b18bb 100644 --- a/lapack-netlib/SRC/cgecon.f +++ b/lapack-netlib/SRC/cgecon.f @@ -105,8 +105,15 @@ *> \verbatim *> INFO is INTEGER *> = 0: successful exit -*> < 0: if INFO = -i, the i-th argument had an illegal value -*> =-5: if ANORM is NAN or negative. +*> < 0: if INFO = -i, the i-th argument had an illegal value. +*> NaNs are illegal values for ANORM, and they propagate to +*> the output parameter RCOND. +*> Infinity is illegal for ANORM, and it propagates to the output +*> parameter RCOND as 0. +*> = 1: if RCOND = NaN, or +*> RCOND = Inf, or +*> the computed norm of the inverse of A is 0. +*> In the latter, RCOND = 0 is returned. *> \endverbatim * * Authors: @@ -117,7 +124,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup gecon * * ===================================================================== SUBROUTINE CGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, RWORK, @@ -147,7 +154,7 @@ LOGICAL ONENRM CHARACTER NORMIN INTEGER IX, KASE, KASE1 - REAL AINVNM, SCALE, SL, SMLNUM, SU + REAL AINVNM, SCALE, SL, SMLNUM, SU, HUGEVAL COMPLEX ZDUM * .. * .. Local Arrays .. @@ -172,6 +179,8 @@ CABS1( ZDUM ) = ABS( REAL( ZDUM ) ) + ABS( AIMAG( ZDUM ) ) * .. * .. Executable Statements .. +* + HUGEVAL = SLAMCH( 'Overflow' ) * * Test the input parameters. * @@ -183,7 +192,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( ANORM.LT.ZERO .OR. SISNAN( ANORM ) ) THEN + ELSE IF( ANORM.LT.ZERO ) THEN INFO = -5 END IF IF( INFO.NE.0 ) THEN @@ -199,6 +208,13 @@ RETURN ELSE IF( ANORM.EQ.ZERO ) THEN RETURN + ELSE IF( SISNAN( ANORM ) ) THEN + RCOND = ANORM + INFO = -5 + RETURN + ELSE IF( ANORM.GT.HUGEVAL ) THEN + INFO = -5 + RETURN END IF * SMLNUM = SLAMCH( 'Safe minimum' ) @@ -256,8 +272,17 @@ * * Compute the estimate of the reciprocal condition number. * - IF( AINVNM.NE.ZERO ) - $ RCOND = ( ONE / AINVNM ) / ANORM + IF( AINVNM.NE.ZERO ) THEN + RCOND = ( ONE / AINVNM ) / ANORM + ELSE + INFO = 1 + RETURN + END IF +* +* Check for NaNs and Infs +* + IF( SISNAN( RCOND ) .OR. RCOND.GT.HUGEVAL ) + $ INFO = 1 * 20 CONTINUE RETURN diff --git a/lapack-netlib/SRC/cgedmd.c b/lapack-netlib/SRC/cgedmd.c new file mode 100644 index 000000000..570395c7b --- /dev/null +++ b/lapack-netlib/SRC/cgedmd.c @@ -0,0 +1,1670 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i= 0 */ +/* The state space dimension (the row dimension of X, Y). */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshot pairs */ +/* (the number of columns of X and Y). */ +/* ..... */ +/* X (input/output) COMPLEX(KIND=WP) M-by-N array */ +/* > On entry, X contains the data snapshot matrix X. It is */ +/* assumed that the column norms of X are in the range of */ +/* the normalized floating point numbers. */ +/* < On exit, the leading K columns of X contain a POD basis, */ +/* i.e. the leading K left singular vectors of the input */ +/* data matrix X, U(:,1:K). All N columns of X contain all */ +/* left singular vectors of the input matrix X. */ +/* See the descriptions of K, Z and W. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= M */ +/* The leading dimension of the array X. */ +/* ..... */ +/* Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array */ +/* > On entry, Y contains the data snapshot matrix Y */ +/* < On exit, */ +/* If JOBR == 'R', the leading K columns of Y contain */ +/* the residual vectors for the computed Ritz pairs. */ +/* See the description of RES. */ +/* If JOBR == 'N', Y contains the original input data, */ +/* scaled according to the value of JOBS. */ +/* ..... */ +/* LDY (input) INTEGER , LDY >= M */ +/* The leading dimension of the array Y. */ +/* ..... */ +/* NRNK (input) INTEGER */ +/* Determines the mode how to compute the numerical rank, */ +/* i.e. how to truncate small singular values of the input */ +/* matrix X. On input, if */ +/* NRNK = -1 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(1) */ +/* This option is recommended. */ +/* NRNK = -2 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(i-1) */ +/* This option is included for R&D purposes. */ +/* It requires highly accurate SVD, which */ +/* may not be feasible. */ +/* The numerical rank can be enforced by using positive */ +/* value of NRNK as follows: */ +/* 0 < NRNK <= N :: at most NRNK largest singular values */ +/* will be used. If the number of the computed nonzero */ +/* singular values is less than NRNK, then only those */ +/* nonzero values will be used and the actually used */ +/* dimension is less than NRNK. The actual number of */ +/* the nonzero singular values is returned in the variable */ +/* K. See the descriptions of TOL and K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the POD basis for the data snapshot */ +/* matrix X and the number of the computed Ritz pairs. */ +/* The value of K is determined according to the rule set */ +/* by the parameters NRNK and TOL. */ +/* See the descriptions of NRNK and TOL. */ +/* ..... */ +/* EIGS (output) COMPLEX(KIND=WP) N-by-1 array */ +/* The leading K (K<=N) entries of EIGS contain */ +/* the computed eigenvalues (Ritz values). */ +/* See the descriptions of K, and Z. */ +/* ..... */ +/* Z (workspace/output) COMPLEX(KIND=WP) M-by-N array */ +/* If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) */ +/* is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. */ +/* If JOBZ == 'F', then the Z(:,i)'s are given implicitly as */ +/* the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) */ +/* is an eigenvector corresponding to EIGS(i). The columns */ +/* of W(1:k,1:K) are the computed eigenvectors of the */ +/* K-by-K Rayleigh quotient. */ +/* See the descriptions of EIGS, X and W. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... */ +/* RES (output) REAL(KIND=WP) N-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs, */ +/* RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. */ +/* See the description of EIGS and Z. */ +/* ..... */ +/* B (output) COMPLEX(KIND=WP) M-by-N array. */ +/* IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:M,1:K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* If JOBF =='N', then B is not referenced. */ +/* See the descriptions of X, W, K. */ +/* ..... */ +/* LDB (input) INTEGER, LDB >= M */ +/* The leading dimension of the array B. */ +/* ..... */ +/* W (workspace/output) COMPLEX(KIND=WP) N-by-N array */ +/* On exit, W(1:K,1:K) contains the K computed */ +/* eigenvectors of the matrix Rayleigh quotient. */ +/* The Ritz vectors (returned in Z) are the */ +/* product of X (containing a POD basis for the input */ +/* matrix X) and W. See the descriptions of K, S, X and Z. */ +/* W is also used as a workspace to temporarily store the */ +/* right singular vectors of X. */ +/* ..... */ +/* LDW (input) INTEGER, LDW >= N */ +/* The leading dimension of the array W. */ +/* ..... */ +/* S (workspace/output) COMPLEX(KIND=WP) N-by-N array */ +/* The array S(1:K,1:K) is used for the matrix Rayleigh */ +/* quotient. This content is overwritten during */ +/* the eigenvalue decomposition by CGEEV. */ +/* See the description of K. */ +/* ..... */ +/* LDS (input) INTEGER, LDS >= N */ +/* The leading dimension of the array S. */ +/* ..... */ +/* ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array */ +/* ZWORK is used as complex workspace in the complex SVD, as */ +/* specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing */ +/* the eigenvalues of a Rayleigh quotient. */ +/* If the call to CGEDMD is only workspace query, then */ +/* ZWORK(1) contains the minimal complex workspace length and */ +/* ZWORK(2) is the optimal complex workspace length. */ +/* Hence, the length of work is at least 2. */ +/* See the description of LZWORK. */ +/* ..... */ +/* LZWORK (input) INTEGER */ +/* The minimal length of the workspace vector ZWORK. */ +/* LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV), */ +/* where LZWORK_CGEEV = MAX( 1, 2*N ) and the minimal */ +/* LZWORK_SVD is calculated as follows */ +/* If WHTSVD == 1 :: CGESVD :: */ +/* LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) */ +/* If WHTSVD == 2 :: CGESDD :: */ +/* LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) */ +/* If WHTSVD == 3 :: CGESVDQ :: */ +/* LZWORK_SVD = obtainable by a query */ +/* If WHTSVD == 4 :: CGEJSV :: */ +/* LZWORK_SVD = obtainable by a query */ +/* If on entry LZWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths and returns them in */ +/* LZWORK(1) and LZWORK(2), respectively. */ +/* ..... */ +/* RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array */ +/* On exit, RWORK(1:N) contains the singular values of */ +/* X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). */ +/* If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain */ +/* scaling factor RWORK(N+2)/RWORK(N+1) used to scale X */ +/* and Y to avoid overflow in the SVD of X. */ +/* This may be of interest if the scaling option is off */ +/* and as many as possible smallest eigenvalues are */ +/* desired to the highest feasible accuracy. */ +/* If the call to CGEDMD is only workspace query, then */ +/* RWORK(1) contains the minimal workspace length. */ +/* See the description of LRWORK. */ +/* ..... */ +/* LRWORK (input) INTEGER */ +/* The minimal length of the workspace vector RWORK. */ +/* LRWORK is calculated as follows: */ +/* LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where */ +/* LRWORK_CGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace */ +/* for the SVD subroutine determined by the input parameter */ +/* WHTSVD. */ +/* If WHTSVD == 1 :: CGESVD :: */ +/* LRWORK_SVD = 5*MIN(M,N) */ +/* If WHTSVD == 2 :: CGESDD :: */ +/* LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), */ +/* 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) */ +/* If WHTSVD == 3 :: CGESVDQ :: */ +/* LRWORK_SVD = obtainable by a query */ +/* If WHTSVD == 4 :: CGEJSV :: */ +/* LRWORK_SVD = obtainable by a query */ +/* If on entry LRWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* real workspace length and returns it in RWORK(1). */ +/* ..... */ +/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */ +/* Workspace that is required only if WHTSVD equals */ +/* 2 , 3 or 4. (See the description of WHTSVD). */ +/* If on entry LWORK =-1 or LIWORK=-1, then the */ +/* minimal length of IWORK is computed and returned in */ +/* IWORK(1). See the description of LIWORK. */ +/* ..... */ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */ +/* If on entry LIWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for ZWORK, RWORK and */ +/* IWORK. See the descriptions of ZWORK, RWORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). */ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ + +/* Local arrays */ +/* ~~~~~~~~~~~~ */ +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ +/* ............................................................ */ + /* Parameter adjustments */ + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --eigs; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + w_dim1 = *ldw; + w_offset = 1 + w_dim1 * 1; + w -= w_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --zwork; + --rwork; + --iwork; + + /* Function Body */ + zero = 0.f; + one = 1.f; + zzero.r = 0.f, zzero.i = 0.f; + zone.r = 1.f, zone.i = 0.f; + +/* Test the input arguments */ + + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + *info = 0; + lquery = *lzwork == -1 || *liwork == -1 || *lrwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || lsame_(jobz, "N") || lsame_( + jobz, "F"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && ! wntvec) { + *info = -3; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -4; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -5; + } else if (*m < 0) { + *info = -6; + } else if (*n < 0 || *n > *m) { + *info = -7; + } else if (*ldx < *m) { + *info = -9; + } else if (*ldy < *m) { + *info = -11; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -12; + } else if (*tol < zero || *tol >= one) { + *info = -13; + } else if (*ldz < *m) { + *info = -17; + } else if ((wntref || wntex) && *ldb < *m) { + *info = -20; + } else if (*ldw < *n) { + *info = -22; + } else if (*lds < *n) { + *info = -24; + } + + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0) { +/* Quick return. All output except K is void. */ +/* INFO=1 signals the void input. */ +/* In case of a workspace query, the default */ +/* minimal workspace lengths are returned. */ + if (lquery) { + iwork[1] = 1; + rwork[1] = 1.f; + zwork[1].r = 2.f, zwork[1].i = 0.f; + zwork[2].r = 2.f, zwork[2].i = 0.f; + } else { + *k = 0; + } + *info = 1; + return 0; + } + iminwr = 1; + mlrwrk = f2cmax(1,*n); + mlwork = 2; + olwork = 2; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of CGESVD: */ +/* MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) */ +/* Computing MAX */ + i__1 = 1, i__2 = (f2cmin(*m,*n) << 1) + f2cmax(*m,*n); + mwrsvd = f2cmax(i__1,i__2); + mlwork = f2cmax(mlwork,mwrsvd); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = *n + f2cmin(*m,*n) * 5; + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + cgesvd_("O", "S", m, n, &x[x_offset], ldx, &rwork[1], &b[ + b_offset], ldb, &w[w_offset], ldw, &zwork[1], &c_n1, + rdummy, &info1); + lwrsvd = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrsvd); + } + } else if (*whtsvd == 2) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of CGESDD: */ +/* MWRSDD = 2*f2cmin(M,N)*f2cmin(M,N)+2*f2cmin(M,N)+f2cmax(M,N). */ +/* RWORK length: 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N) */ +/* In LAPACK 3.10.1 RWORK is defined differently. */ +/* Below we take f2cmax over the two versions. */ +/* IMINWR = 8*MIN(M,N) */ + mwrsdd = (f2cmin(*m,*n) << 1) * f2cmin(*m,*n) + (f2cmin(*m,*n) << 1) + f2cmax( + *m,*n); + mlwork = f2cmax(mlwork,mwrsdd); + iminwr = f2cmin(*m,*n) << 3; +/* Computing MAX */ +/* Computing MAX */ + i__3 = f2cmin(*m,*n) * 5 * f2cmin(*m,*n) + f2cmin(*m,*n) * 7, i__4 = f2cmin(* + m,*n) * 5 * f2cmin(*m,*n) + f2cmin(*m,*n) * 5, i__3 = f2cmax(i__3, + i__4), i__4 = (f2cmax(*m,*n) << 1) * f2cmin(*m,*n) + (f2cmin(*m,*n) + << 1) * f2cmin(*m,*n) + f2cmin(*m,*n); + i__1 = mlrwrk, i__2 = *n + f2cmax(i__3,i__4); + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + cgesdd_("O", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], + ldb, &w[w_offset], ldw, &zwork[1], &c_n1, rdummy, & + iwork[1], &info1); +/* Computing MAX */ + i__1 = mwrsdd, i__2 = (integer) zwork[1].r; + lwrsdd = f2cmax(i__1,i__2); + olwork = f2cmax(olwork,lwrsdd); + } + } else if (*whtsvd == 3) { + cgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &rwork[ + 1], &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, & + iwork[1], &c_n1, &zwork[1], &c_n1, rdummy, &c_n1, &info1); + iminwr = iwork[1]; + mwrsvq = (integer) zwork[2].r; + mlwork = f2cmax(mlwork,mwrsvq); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = *n + (integer) rdummy[0]; + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + lwrsvq = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrsvq); + } + } else if (*whtsvd == 4) { + *(unsigned char *)jsvopt = 'J'; + cgejsv_("F", "U", jsvopt, "N", "N", "P", m, n, &x[x_offset], ldx, + &rwork[1], &z__[z_offset], ldz, &w[w_offset], ldw, &zwork[ + 1], &c_n1, rdummy, &c_n1, &iwork[1], &info1); + iminwr = iwork[1]; + mwrsvj = (integer) zwork[2].r; + mlwork = f2cmax(mlwork,mwrsvj); +/* Computing MAX */ +/* Computing MAX */ + i__3 = 7, i__4 = (integer) rdummy[0]; + i__1 = mlrwrk, i__2 = *n + f2cmax(i__3,i__4); + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + lwrsvj = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrsvj); + } +/* END SELECT */ + } + if (wntvec || wntex || lsame_(jobz, "F")) { + *(unsigned char *)jobzl = 'V'; + } else { + *(unsigned char *)jobzl = 'N'; + } +/* Workspace calculation to the CGEEV call */ +/* Computing MAX */ + i__1 = 1, i__2 = *n << 1; + mwrkev = f2cmax(i__1,i__2); + mlwork = f2cmax(mlwork,mwrkev); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = *n + (*n << 1); + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + cgeev_("N", jobzl, n, &s[s_offset], lds, &eigs[1], &w[w_offset], + ldw, &w[w_offset], ldw, &zwork[1], &c_n1, &rwork[1], & + info1); +/* LAPACK CALL */ + lwrkev = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrkev); + olwork = f2cmax(2,olwork); + } + + if (*liwork < iminwr && ! lquery) { + *info = -30; + } + if (*lrwork < mlrwrk && ! lquery) { + *info = -28; + } + if (*lzwork < mlwork && ! lquery) { + *info = -26; + } + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("CGEDMD", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + rwork[1] = (real) mlrwrk; + zwork[1].r = (real) mlwork, zwork[1].i = 0.f; + zwork[2].r = (real) olwork, zwork[2].i = 0.f; + return 0; + } +/* ............................................................ */ + + ofl = slamch_("O") * slamch_("P"); + small = slamch_("S"); + badxy = FALSE_; + +/* <1> Optional scaling of the snapshots (columns of X, Y) */ +/* ========================================================== */ + if (sccolx) { +/* The columns of X will be normalized. */ +/* To prevent overflows, the column norms of X are */ +/* carefully computed using CLASSQ. */ + *k = 0; + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* WORK(i) = SCNRM2( M, X(1,i), 1 ) */ + scale = zero; + classq_(m, &x[i__ * x_dim1 + 1], &c__1, &scale, &ssum); + if (sisnan_(&scale) || sisnan_(&ssum)) { + *k = 0; + *info = -8; + i__2 = -(*info); + xerbla_("CGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of X(:,i) overflows. First, X(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. */ +/* Next, the norm of X(:,i) is stored without */ +/* overflow as WORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of X(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + r__1 = one / rootsc; + clascl_("G", &c__0, &c__0, &scale, &r__1, m, &c__1, &x[ + i__ * x_dim1 + 1], ldx, &info2); + rwork[i__] = -scale * (rootsc / (real) (*m)); + } else { +/* X(:,i) will be scaled to unit 2-norm */ + rwork[i__] = scale * rootsc; + clascl_("G", &c__0, &c__0, &rwork[i__], &one, m, &c__1, & + x[i__ * x_dim1 + 1], ldx, &info2); +/* X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC */ +/* LAPAC */ + } + } else { + rwork[i__] = zero; + ++(*k); + } + } + if (*k == *n) { +/* All columns of X are zero. Return error code -8. */ +/* (the 8th input variable had an illegal value) */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("CGEDMD", &i__1); + return 0; + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of Y. */ + if (rwork[i__] > zero) { + r__1 = one / rwork[i__]; + csscal_(m, &r__1, &y[i__ * y_dim1 + 1], &c__1); +/* Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (rwork[i__] < zero) { + r__1 = -rwork[i__]; + r__2 = one / (real) (*m); + clascl_("G", &c__0, &c__0, &r__1, &r__2, m, &c__1, &y[i__ * + y_dim1 + 1], ldy, &info2); +/* LAPACK C */ + } else if (c_abs(&y[icamax_(m, &y[i__ * y_dim1 + 1], &c__1) + i__ + * y_dim1]) != zero) { +/* X(:,i) is zero vector. For consistency, */ +/* Y(:,i) should also be zero. If Y(:,i) is not */ +/* zero, then the data might be inconsistent or */ +/* corrupted. If JOBS == 'C', Y(:,i) is set to */ +/* zero and a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + if (lsame_(jobs, "C")) { + csscal_(m, &zero, &y[i__ * y_dim1 + 1], &c__1); + } +/* BLAS CALL */ + } + } + } + + if (sccoly) { +/* The columns of Y will be normalized. */ +/* To prevent overflows, the column norms of Y are */ +/* carefully computed using CLASSQ. */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* RWORK(i) = SCNRM2( M, Y(1,i), 1 ) */ + scale = zero; + classq_(m, &y[i__ * y_dim1 + 1], &c__1, &scale, &ssum); + if (sisnan_(&scale) || sisnan_(&ssum)) { + *k = 0; + *info = -10; + i__2 = -(*info); + xerbla_("CGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of Y(:,i) overflows. First, Y(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. */ +/* Next, the norm of Y(:,i) is stored without */ +/* overflow as RWORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of Y(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + r__1 = one / rootsc; + clascl_("G", &c__0, &c__0, &scale, &r__1, m, &c__1, &y[ + i__ * y_dim1 + 1], ldy, &info2); + rwork[i__] = -scale * (rootsc / (real) (*m)); + } else { +/* Y(:,i) will be scaled to unit 2-norm */ + rwork[i__] = scale * rootsc; + clascl_("G", &c__0, &c__0, &rwork[i__], &one, m, &c__1, & + y[i__ * y_dim1 + 1], ldy, &info2); +/* Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* LAPA */ + } + } else { + rwork[i__] = zero; + } + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of X. */ + if (rwork[i__] > zero) { + r__1 = one / rwork[i__]; + csscal_(m, &r__1, &x[i__ * x_dim1 + 1], &c__1); +/* X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (rwork[i__] < zero) { + r__1 = -rwork[i__]; + r__2 = one / (real) (*m); + clascl_("G", &c__0, &c__0, &r__1, &r__2, m, &c__1, &x[i__ * + x_dim1 + 1], ldx, &info2); +/* LAPACK */ + } else if (c_abs(&x[icamax_(m, &x[i__ * x_dim1 + 1], &c__1) + i__ + * x_dim1]) != zero) { +/* Y(:,i) is zero vector. If X(:,i) is not */ +/* zero, then a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + } + } + } + +/* <2> SVD of the data snapshot matrix X. */ +/* ===================================== */ +/* The left singular vectors are stored in the array X. */ +/* The right singular vectors are in the array W. */ +/* The array W will later on contain the eigenvectors */ +/* of a Rayleigh quotient. */ + numrnk = *n; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { + cgesvd_("O", "S", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], + ldb, &w[w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], & + info1); +/* LA */ + *(unsigned char *)t_or_n__ = 'C'; + } else if (*whtsvd == 2) { + cgesdd_("O", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], ldb, & + w[w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &iwork[1] + , &info1); +/* LAP */ + *(unsigned char *)t_or_n__ = 'C'; + } else if (*whtsvd == 3) { + i__1 = *lrwork - *n; + cgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &rwork[1], + &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &iwork[1], + liwork, &zwork[1], lzwork, &rwork[*n + 1], &i__1, &info1); +/* LAPACK CA */ + clacpy_("A", m, &numrnk, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK C */ + *(unsigned char *)t_or_n__ = 'C'; + } else if (*whtsvd == 4) { + i__1 = *lrwork - *n; + cgejsv_("F", "U", jsvopt, "N", "N", "P", m, n, &x[x_offset], ldx, & + rwork[1], &z__[z_offset], ldz, &w[w_offset], ldw, &zwork[1], + lzwork, &rwork[*n + 1], &i__1, &iwork[1], &info1); + clacpy_("A", m, n, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK CALL */ + *(unsigned char *)t_or_n__ = 'N'; + xscl1 = rwork[*n + 1]; + xscl2 = rwork[*n + 2]; + if (xscl1 != xscl2) { +/* This is an exceptional situation. If the */ +/* data matrices are not scaled and the */ +/* largest singular value of X overflows. */ +/* In that case CGEJSV can return the SVD */ +/* in scaled form. The scaling factor can be used */ +/* to rescale the data (X and Y). */ + clascl_("G", &c__0, &c__0, &xscl1, &xscl2, m, n, &y[y_offset], + ldy, &info2); + } +/* END SELECT */ + } + + if (info1 > 0) { +/* The SVD selected subroutine did not converge. */ +/* Return with an error code. */ + *info = 2; + return 0; + } + + if (rwork[1] == zero) { +/* The largest computed singular value of (scaled) */ +/* X is zero. Return error code -8 */ +/* (the 8th input variable had an illegal value). */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("CGEDMD", &i__1); + return 0; + } + +/* <3> Determine the numerical rank of the data */ +/* snapshots matrix X. This depends on the */ +/* parameters NRNK and TOL. */ +/* SELECT CASE ( NRNK ) */ + if (*nrnk == -1) { + *k = 1; + i__1 = numrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (rwork[i__] <= rwork[1] * *tol || rwork[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else if (*nrnk == -2) { + *k = 1; + i__1 = numrnk - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + if (rwork[i__ + 1] <= rwork[i__] * *tol || rwork[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else { + *k = 1; + i__1 = *nrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (rwork[i__] <= small) { + myexit_(); + } + ++(*k); + } +/* END SELECT */ + } +/* Now, U = X(1:M,1:K) is the SVD/POD basis for the */ +/* snapshot data in the input matrix X. */ +/* <4> Compute the Rayleigh quotient S = U^H * A * U. */ +/* Depending on the requested outputs, the computation */ +/* is organized to compute additional auxiliary */ +/* matrices (for the residuals and refinements). */ + +/* In all formulas below, we need V_k*Sigma_k^(-1) */ +/* where either V_k is in W(1:N,1:K), or V_k^H is in */ +/* W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). */ + if (lsame_(t_or_n__, "N")) { + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + r__1 = one / rwork[i__]; + csscal_(n, &r__1, &w[i__ * w_dim1 + 1], &c__1); +/* W(1:N,i) = (ONE/RWORK(i)) * W(1:N,i) ! INTRINSIC */ +/* BLAS CALL */ + } + } else { +/* This non-unit stride access is due to the fact */ +/* that CGESVD, CGESVDQ and CGESDD return the */ +/* adjoint matrix of the right singular vectors. */ +/* DO i = 1, K */ +/* CALL DSCAL( N, ONE/RWORK(i), W(i,1), LDW ) ! BLAS CALL */ +/* ! W(i,1:N) = (ONE/RWORK(i)) * W(i,1:N) ! INTRINSIC */ +/* END DO */ + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + rwork[*n + i__] = one / rwork[i__]; + } + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *k; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * w_dim1; + i__4 = *n + i__; + q__2.r = rwork[i__4], q__2.i = zero; + i__5 = i__ + j * w_dim1; + q__1.r = q__2.r * w[i__5].r - q__2.i * w[i__5].i, q__1.i = + q__2.r * w[i__5].i + q__2.i * w[i__5].r; + w[i__3].r = q__1.r, w[i__3].i = q__1.i; + } + } + } + + if (wntref) { + +/* Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) */ +/* for computing the refined Ritz vectors */ +/* (optionally, outside CGEDMD). */ + cgemm_("N", t_or_n__, m, k, n, &zone, &y[y_offset], ldy, &w[w_offset], + ldw, &zzero, &z__[z_offset], ldz); +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRI */ +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K)) ! INTRI */ + +/* At this point Z contains */ +/* A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and */ +/* this is needed for computing the residuals. */ +/* This matrix is returned in the array B and */ +/* it can be used to compute refined Ritz vectors. */ +/* BLAS */ + clacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); +/* B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC */ +/* BLAS CALL */ + cgemm_("C", "N", k, k, m, &zone, &x[x_offset], ldx, &z__[z_offset], + ldz, &zzero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRI */ +/* At this point S = U^H * A * U is the Rayleigh quotient. */ +/* BLAS */ + } else { +/* A * U(:,1:K) is not explicitly needed and the */ +/* computation is organized differently. The Rayleigh */ +/* quotient is computed more efficiently. */ + cgemm_("C", "N", k, n, m, &zone, &x[x_offset], ldx, &y[y_offset], ldy, + &zzero, &z__[z_offset], ldz); +/* Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) ) ! IN */ + +/* B */ + cgemm_("N", t_or_n__, k, k, n, &zone, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zzero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRIN */ +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K))) ! INTRIN */ +/* At this point S = U^H * A * U is the Rayleigh quotient. */ +/* If the residuals are requested, save scaled V_k into Z. */ +/* Recall that V_k or V_k^H is stored in W. */ +/* BLAS */ + if (wntres || wntex) { + if (lsame_(t_or_n__, "N")) { + clacpy_("A", n, k, &w[w_offset], ldw, &z__[z_offset], ldz); + } else { + clacpy_("A", k, n, &w[w_offset], ldw, &z__[z_offset], ldz); + } + } + } + +/* <5> Compute the Ritz values and (if requested) the */ +/* right eigenvectors of the Rayleigh quotient. */ + + cgeev_("N", jobzl, k, &s[s_offset], lds, &eigs[1], &w[w_offset], ldw, &w[ + w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &info1); + +/* W(1:K,1:K) contains the eigenvectors of the Rayleigh */ +/* quotient. See the description of Z. */ +/* Also, see the description of CGEEV. */ +/* LAPACK CA */ + if (info1 > 0) { +/* CGEEV failed to compute the eigenvalues and */ +/* eigenvectors of the Rayleigh quotient. */ + *info = 3; + return 0; + } + +/* <6> Compute the eigenvectors (if requested) and, */ +/* the residuals (if requested). */ + + if (wntvec || wntex) { + if (wntres) { + if (wntref) { +/* Here, if the refinement is requested, we have */ +/* A*U(:,1:K) already computed and stored in Z. */ +/* For the residuals, need Y = A * U(:,1;K) * W. */ + cgemm_("N", "N", m, k, k, &zone, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zzero, &y[y_offset], ldy); +/* Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC */ +/* This frees Z; Y contains A * U(:,1:K) * W. */ +/* BLAS CALL */ + } else { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) (or its adjoint) is stored in Z */ + cgemm_(t_or_n__, "N", n, k, k, &zone, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zzero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + cgemm_("N", "N", m, k, n, &zone, &y[y_offset], ldy, &s[ + s_offset], lds, &zzero, &z__[z_offset], ldz); +/* Save a copy of Z into Y and free Z for holding */ +/* the Ritz vectors. */ + clacpy_("A", m, k, &z__[z_offset], ldz, &y[y_offset], ldy); + if (wntex) { + clacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); + } + } + } else if (wntex) { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) is stored in Z */ + cgemm_(t_or_n__, "N", n, k, k, &zone, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zzero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + cgemm_("N", "N", m, k, n, &zone, &y[y_offset], ldy, &s[s_offset], + lds, &zzero, &b[b_offset], ldb); +/* The above call replaces the following two calls */ +/* that were used in the developing-testing phase. */ +/* CALL CGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & */ +/* LDS, ZZERO, Z, LDZ) */ +/* Save a copy of Z into Y and free Z for holding */ +/* the Ritz vectors. */ +/* CALL CLACPY( 'A', M, K, Z, LDZ, B, LDB ) */ + } + +/* Compute the Ritz vectors */ + if (wntvec) { + cgemm_("N", "N", m, k, k, &zone, &x[x_offset], ldx, &w[w_offset], + ldw, &zzero, &z__[z_offset], ldz); + } +/* Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRIN */ + +/* BLAS CALL */ + if (wntres) { + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + q__1.r = -eigs[i__2].r, q__1.i = -eigs[i__2].i; + caxpy_(m, &q__1, &z__[i__ * z_dim1 + 1], &c__1, &y[i__ * + y_dim1 + 1], &c__1); +/* Y(1:M,i) = Y(1:M,i) - EIGS(i) * Z(1:M,i) ! */ + + res[i__] = scnrm2_(m, &y[i__ * y_dim1 + 1], &c__1); + + } + } + } + + if (*whtsvd == 4) { + rwork[*n + 1] = xscl1; + rwork[*n + 2] = xscl2; + } + +/* Successful exit. */ + if (! badxy) { + *info = 0; + } else { +/* A warning on possible data inconsistency. */ +/* This should be a rare event. */ + *info = 4; + } +/* ............................................................ */ + return 0; +/* ...... */ +} /* cgedmd_ */ + diff --git a/lapack-netlib/SRC/cgedmd.f90 b/lapack-netlib/SRC/cgedmd.f90 new file mode 100644 index 000000000..1413130ec --- /dev/null +++ b/lapack-netlib/SRC/cgedmd.f90 @@ -0,0 +1,1151 @@ +!> \brief \b CGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== +! +! SUBROUTINE CGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, EIGS, Z, LDZ, RES, B, LDB, & +! W, LDW, S, LDS, ZWORK, LZWORK, & +! RWORK, LRWORK, IWORK, LIWORK, INFO ) +!..... +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real32 +! +!..... +! Scalar arguments +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LIWORK, LRWORK, LZWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments +! COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) +! COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) +! REAL(KIND=WP), INTENT(OUT) :: RES(*) +! REAL(KIND=WP), INTENT(OUT) :: RWORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) +! +!............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> CGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, CGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, CGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim +!...................................................................... +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. +!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim +!...................................................................... +!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim +!...................................................................... +! Arguments +! ========= +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim +!..... +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. +!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim +!..... +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. +!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim +!..... +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim +!..... +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library. +!> 1 :: CGESVD (the QR SVD algorithm) +!> 2 :: CGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: CGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim +!..... +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim +!..... +!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim +!..... +!> \param[in,out] X +!> \verbatim +!> X (input/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. +!> \endverbatim +!..... +!> \param[in] LDX +!> \verbatim +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim +!..... +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim +!..... +!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim +!..... +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1) +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim +!..... +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim +!..... +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim +!..... +!> \param[out] EIGS +!> \verbatim +!> EIGS (output) COMPLEX(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of EIGS contain +!> the computed eigenvalues (Ritz values). +!> See the descriptions of K, and Z. +!> \endverbatim +!..... +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) COMPLEX(KIND=WP) M-by-N array +!> If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) +!> is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. +!> If JOBZ == 'F', then the Z(:,i)'s are given implicitly as +!> the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) +!> is an eigenvector corresponding to EIGS(i). The columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. +!> See the descriptions of EIGS, X and W. +!> \endverbatim +!..... +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim +!..... +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs, +!> RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. +!> See the description of EIGS and Z. +!> \endverbatim +!..... +!> \param[out] B +!> \verbatim +!> B (output) COMPLEX(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1:K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim +!..... +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim +!..... +!> \param[out] W +!> \verbatim +!> W (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient. +!> The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> right singular vectors of X. +!> \endverbatim +!..... +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim +!..... +!> \param[out] S +!> \verbatim +!> S (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by CGEEV. +!> See the description of K. +!> \endverbatim +!..... +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim +!..... +!> \param[out] ZWORK +!> \verbatim +!> ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array +!> ZWORK is used as complex workspace in the complex SVD, as +!> specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing +!> the eigenvalues of a Rayleigh quotient. +!> If the call to CGEDMD is only workspace query, then +!> ZWORK(1) contains the minimal complex workspace length and +!> ZWORK(2) is the optimal complex workspace length. +!> Hence, the length of work is at least 2. +!> See the description of LZWORK. +!> \endverbatim +!..... +!> \param[in] LZWORK +!> \verbatim +!> LZWORK (input) INTEGER +!> The minimal length of the workspace vector ZWORK. +!> LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV), +!> where LZWORK_CGEEV = MAX( 1, 2*N ) and the minimal +!> LZWORK_SVD is calculated as follows +!> If WHTSVD == 1 :: CGESVD :: +!> LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) +!> If WHTSVD == 2 :: CGESDD :: +!> LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) +!> If WHTSVD == 3 :: CGESVDQ :: +!> LZWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: CGEJSV :: +!> LZWORK_SVD = obtainable by a query +!> If on entry LZWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths and returns them in +!> LZWORK(1) and LZWORK(2), respectively. +!> \endverbatim +!..... +!> \param[out] RWORK +!> \verbatim +!> RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array +!> On exit, RWORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +!> If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain +!> scaling factor RWORK(N+2)/RWORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. +!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. +!> If the call to CGEDMD is only workspace query, then +!> RWORK(1) contains the minimal workspace length. +!> See the description of LRWORK. +!> \endverbatim +!..... +!> \param[in] LRWORK +!> \verbatim +!> LRWORK (input) INTEGER +!> The minimal length of the workspace vector RWORK. +!> LRWORK is calculated as follows: +!> LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where +!> LRWORK_CGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace +!> for the SVD subroutine determined by the input parameter +!> WHTSVD. +!> If WHTSVD == 1 :: CGESVD :: +!> LRWORK_SVD = 5*MIN(M,N) +!> If WHTSVD == 2 :: CGESDD :: +!> LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), +!> 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) +!> If WHTSVD == 3 :: CGESVDQ :: +!> LRWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: CGEJSV :: +!> LRWORK_SVD = obtainable by a query +!> If on entry LRWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> real workspace length and returns it in RWORK(1). +!> \endverbatim +!..... +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim +!..... +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for ZWORK, RWORK and +!> IWORK. See the descriptions of ZWORK, RWORK and IWORK. +!> \endverbatim +!..... +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! +!............................................................. +!............................................................. + SUBROUTINE CGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, EIGS, Z, LDZ, RES, B, LDB, & + W, LDW, S, LDS, ZWORK, LZWORK, & + RWORK, LRWORK, IWORK, LIWORK, INFO ) +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real32 +! +! Scalar arguments +! ~~~~~~~~~~~~~~~~ + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LIWORK, LRWORK, LZWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! +! Array arguments +! ~~~~~~~~~~~~~~~ + COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) + COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) + REAL(KIND=WP), INTENT(OUT) :: RES(*) + REAL(KIND=WP), INTENT(OUT) :: RWORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +! +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP + COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) + COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) +! +! Local scalars +! ~~~~~~~~~~~~~ + REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & + SSUM, XSCL1, XSCL2 + INTEGER :: i, j, IMINWR, INFO1, INFO2, & + LWRKEV, LWRSDD, LWRSVD, LWRSVJ, & + LWRSVQ, MLWORK, MWRKEV, MWRSDD, & + MWRSVD, MWRSVJ, MWRSVQ, NUMRNK, & + OLWORK, MLRWRK + LOGICAL :: BADXY, LQUERY, SCCOLX, SCCOLY, & + WNTEX, WNTREF, WNTRES, WNTVEC + CHARACTER :: JOBZL, T_OR_N + CHARACTER :: JSVOPT +! +! Local arrays +! ~~~~~~~~~~~~ + REAL(KIND=WP) :: RDUMMY(2) +! +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + REAL(KIND=WP) CLANGE, SLAMCH, SCNRM2 + EXTERNAL CLANGE, SLAMCH, SCNRM2, ICAMAX + INTEGER ICAMAX + LOGICAL SISNAN, LSAME + EXTERNAL SISNAN, LSAME +! +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL CAXPY, CGEMM, CSSCAL + EXTERNAL CGEEV, CGEJSV, CGESDD, CGESVD, CGESVDQ, & + CLACPY, CLASCL, CLASSQ, XERBLA +! +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC FLOAT, INT, MAX, SQRT +!............................................................ +! +! Test the input arguments +! + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME(JOBS,'C') + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + INFO = 0 + LQUERY = ( ( LZWORK == -1 ) .OR. ( LIWORK == -1 ) & + .OR. ( LRWORK == -1 ) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. & + LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. LSAME(JOBZ,'N') & + .OR. LSAME(JOBZ,'F')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. (.NOT.WNTVEC) ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -4 + ELSE IF ( .NOT.((WHTSVD == 1) .OR. (WHTSVD == 2) .OR. & + (WHTSVD == 3) .OR. (WHTSVD == 4) )) THEN + INFO = -5 + ELSE IF ( M < 0 ) THEN + INFO = -6 + ELSE IF ( ( N < 0 ) .OR. ( N > M ) ) THEN + INFO = -7 + ELSE IF ( LDX < M ) THEN + INFO = -9 + ELSE IF ( LDY < M ) THEN + INFO = -11 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -12 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -13 + ELSE IF ( LDZ < M ) THEN + INFO = -17 + ELSE IF ( (WNTREF .OR. WNTEX ) .AND. ( LDB < M ) ) THEN + INFO = -20 + ELSE IF ( LDW < N ) THEN + INFO = -22 + ELSE IF ( LDS < N ) THEN + INFO = -24 + END IF +! + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( N == 0 ) THEN + ! Quick return. All output except K is void. + ! INFO=1 signals the void input. + ! In case of a workspace query, the default + ! minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + RWORK(1) = 1 + ZWORK(1) = 2 + ZWORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + + IMINWR = 1 + MLRWRK = MAX(1,N) + MLWORK = 2 + OLWORK = 2 + SELECT CASE ( WHTSVD ) + CASE (1) + ! The following is specified as the minimal + ! length of WORK in the definition of CGESVD: + ! MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) + MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) + MLWORK = MAX(MLWORK,MWRSVD) + MLRWRK = MAX(MLRWRK,N + 5*MIN(M,N)) + IF ( LQUERY ) THEN + CALL CGESVD( 'O', 'S', M, N, X, LDX, RWORK, & + B, LDB, W, LDW, ZWORK, -1, RDUMMY, INFO1 ) + LWRSVD = INT( ZWORK(1) ) + OLWORK = MAX(OLWORK,LWRSVD) + END IF + CASE (2) + ! The following is specified as the minimal + ! length of WORK in the definition of CGESDD: + ! MWRSDD = 2*min(M,N)*min(M,N)+2*min(M,N)+max(M,N). + ! RWORK length: 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N) + ! In LAPACK 3.10.1 RWORK is defined differently. + ! Below we take max over the two versions. + ! IMINWR = 8*MIN(M,N) + MWRSDD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) + MLWORK = MAX(MLWORK,MWRSDD) + IMINWR = 8*MIN(M,N) + MLRWRK = MAX( MLRWRK, N + & + MAX( 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), & + 5*MIN(M,N)*MIN(M,N)+5*MIN(M,N), & + 2*MAX(M,N)*MIN(M,N)+ & + 2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) + IF ( LQUERY ) THEN + CALL CGESDD( 'O', M, N, X, LDX, RWORK, B, & + LDB, W, LDW, ZWORK, -1, RDUMMY, IWORK, INFO1 ) + LWRSDD = MAX(MWRSDD,INT( ZWORK(1) )) + OLWORK = MAX(OLWORK,LWRSDD) + END IF + CASE (3) + CALL CGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, RWORK, Z, LDZ, W, LDW, NUMRNK, & + IWORK, -1, ZWORK, -1, RDUMMY, -1, INFO1 ) + IMINWR = IWORK(1) + MWRSVQ = INT(ZWORK(2)) + MLWORK = MAX(MLWORK,MWRSVQ) + MLRWRK = MAX(MLRWRK,N + INT(RDUMMY(1))) + IF ( LQUERY ) THEN + LWRSVQ = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,LWRSVQ) + END IF + CASE (4) + JSVOPT = 'J' + CALL CGEJSV( 'F', 'U', JSVOPT, 'N', 'N', 'P', M, & + N, X, LDX, RWORK, Z, LDZ, W, LDW, & + ZWORK, -1, RDUMMY, -1, IWORK, INFO1 ) + IMINWR = IWORK(1) + MWRSVJ = INT(ZWORK(2)) + MLWORK = MAX(MLWORK,MWRSVJ) + MLRWRK = MAX(MLRWRK,N + MAX(7,INT(RDUMMY(1)))) + IF ( LQUERY ) THEN + LWRSVJ = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,LWRSVJ) + END IF + END SELECT + IF ( WNTVEC .OR. WNTEX .OR. LSAME(JOBZ,'F') ) THEN + JOBZL = 'V' + ELSE + JOBZL = 'N' + END IF + ! Workspace calculation to the CGEEV call + MWRKEV = MAX( 1, 2*N ) + MLWORK = MAX(MLWORK,MWRKEV) + MLRWRK = MAX(MLRWRK,N+2*N) + IF ( LQUERY ) THEN + CALL CGEEV( 'N', JOBZL, N, S, LDS, EIGS, & + W, LDW, W, LDW, ZWORK, -1, RWORK, INFO1 ) ! LAPACK CALL + LWRKEV = INT(ZWORK(1)) + OLWORK = MAX( OLWORK, LWRKEV ) + OLWORK = MAX( 2, OLWORK ) + END IF +! + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -30 + IF ( LRWORK < MLRWRK .AND. (.NOT.LQUERY) ) INFO = -28 + IF ( LZWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -26 + + END IF +! + IF( INFO /= 0 ) THEN + CALL XERBLA( 'CGEDMD', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + RWORK(1) = MLRWRK + ZWORK(1) = MLWORK + ZWORK(2) = OLWORK + RETURN + END IF +!............................................................ +! + OFL = SLAMCH('O')*SLAMCH('P') + SMALL = SLAMCH('S') + BADXY = .FALSE. +! +! <1> Optional scaling of the snapshots (columns of X, Y) +! ========================================================== + IF ( SCCOLX ) THEN + ! The columns of X will be normalized. + ! To prevent overflows, the column norms of X are + ! carefully computed using CLASSQ. + K = 0 + DO i = 1, N + !WORK(i) = SCNRM2( M, X(1,i), 1 ) + SSUM = ONE + SCALE = ZERO + CALL CLASSQ( M, X(1,i), 1, SCALE, SSUM ) + IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN + K = 0 + INFO = -8 + CALL XERBLA('CGEDMD',-INFO) + END IF + IF ( (SCALE /= ZERO) .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of X(:,i) overflows. First, X(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. +! Next, the norm of X(:,i) is stored without +! overflow as WORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of X(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL CLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, X(1,i), LDX, INFO2 ) + RWORK(i) = - SCALE * ( ROOTSC / FLOAT(M) ) + ELSE +! X(:,i) will be scaled to unit 2-norm + RWORK(i) = SCALE * ROOTSC + CALL CLASCL( 'G',0, 0, RWORK(i), ONE, M, 1, & + X(1,i), LDX, INFO2 ) ! LAPACK CALL +! X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC + END IF + ELSE + RWORK(i) = ZERO + K = K + 1 + END IF + END DO + IF ( K == N ) THEN + ! All columns of X are zero. Return error code -8. + ! (the 8th input variable had an illegal value) + K = 0 + INFO = -8 + CALL XERBLA('CGEDMD',-INFO) + RETURN + END IF + DO i = 1, N +! Now, apply the same scaling to the columns of Y. + IF ( RWORK(i) > ZERO ) THEN + CALL CSSCAL( M, ONE/RWORK(i), Y(1,i), 1 ) ! BLAS CALL +! Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC + ELSE IF ( RWORK(i) < ZERO ) THEN + CALL CLASCL( 'G', 0, 0, -RWORK(i), & + ONE/FLOAT(M), M, 1, Y(1,i), LDY, INFO2 ) ! LAPACK CALL + ELSE IF ( ABS(Y(ICAMAX(M, Y(1,i),1),i )) & + /= ZERO ) THEN +! X(:,i) is zero vector. For consistency, +! Y(:,i) should also be zero. If Y(:,i) is not +! zero, then the data might be inconsistent or +! corrupted. If JOBS == 'C', Y(:,i) is set to +! zero and a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + IF ( LSAME(JOBS,'C')) & + CALL CSSCAL( M, ZERO, Y(1,i), 1 ) ! BLAS CALL + END IF + END DO + END IF + ! + IF ( SCCOLY ) THEN + ! The columns of Y will be normalized. + ! To prevent overflows, the column norms of Y are + ! carefully computed using CLASSQ. + DO i = 1, N + !RWORK(i) = SCNRM2( M, Y(1,i), 1 ) + SSUM = ONE + SCALE = ZERO + CALL CLASSQ( M, Y(1,i), 1, SCALE, SSUM ) + IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN + K = 0 + INFO = -10 + CALL XERBLA('CGEDMD',-INFO) + END IF + IF ( SCALE /= ZERO .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of Y(:,i) overflows. First, Y(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. +! Next, the norm of Y(:,i) is stored without +! overflow as RWORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of Y(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL CLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, Y(1,i), LDY, INFO2 ) + RWORK(i) = - SCALE * ( ROOTSC / FLOAT(M) ) + ELSE +! Y(:,i) will be scaled to unit 2-norm + RWORK(i) = SCALE * ROOTSC + CALL CLASCL( 'G',0, 0, RWORK(i), ONE, M, 1, & + Y(1,i), LDY, INFO2 ) ! LAPACK CALL +! Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC + END IF + ELSE + RWORK(i) = ZERO + END IF + END DO + DO i = 1, N +! Now, apply the same scaling to the columns of X. + IF ( RWORK(i) > ZERO ) THEN + CALL CSSCAL( M, ONE/RWORK(i), X(1,i), 1 ) ! BLAS CALL +! X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC + ELSE IF ( RWORK(i) < ZERO ) THEN + CALL CLASCL( 'G', 0, 0, -RWORK(i), & + ONE/FLOAT(M), M, 1, X(1,i), LDX, INFO2 ) ! LAPACK CALL + ELSE IF ( ABS(X(ICAMAX(M, X(1,i),1),i )) & + /= ZERO ) THEN +! Y(:,i) is zero vector. If X(:,i) is not +! zero, then a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + END IF + END DO + END IF +! +! <2> SVD of the data snapshot matrix X. +! ===================================== +! The left singular vectors are stored in the array X. +! The right singular vectors are in the array W. +! The array W will later on contain the eigenvectors +! of a Rayleigh quotient. + NUMRNK = N + SELECT CASE ( WHTSVD ) + CASE (1) + CALL CGESVD( 'O', 'S', M, N, X, LDX, RWORK, B, & + LDB, W, LDW, ZWORK, LZWORK, RWORK(N+1), INFO1 ) ! LAPACK CALL + T_OR_N = 'C' + CASE (2) + CALL CGESDD( 'O', M, N, X, LDX, RWORK, B, LDB, W, & + LDW, ZWORK, LZWORK, RWORK(N+1), IWORK, INFO1 ) ! LAPACK CALL + T_OR_N = 'C' + CASE (3) + CALL CGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, RWORK, Z, LDZ, W, LDW, & + NUMRNK, IWORK, LIWORK, ZWORK, & + LZWORK, RWORK(N+1), LRWORK-N, INFO1) ! LAPACK CALL + CALL CLACPY( 'A', M, NUMRNK, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'C' + CASE (4) + CALL CGEJSV( 'F', 'U', JSVOPT, 'N', 'N', 'P', M, & + N, X, LDX, RWORK, Z, LDZ, W, LDW, & + ZWORK, LZWORK, RWORK(N+1), LRWORK-N, IWORK, INFO1 ) ! LAPACK CALL + CALL CLACPY( 'A', M, N, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'N' + XSCL1 = RWORK(N+1) + XSCL2 = RWORK(N+2) + IF ( XSCL1 /= XSCL2 ) THEN + ! This is an exceptional situation. If the + ! data matrices are not scaled and the + ! largest singular value of X overflows. + ! In that case CGEJSV can return the SVD + ! in scaled form. The scaling factor can be used + ! to rescale the data (X and Y). + CALL CLASCL( 'G', 0, 0, XSCL1, XSCL2, M, N, Y, LDY, INFO2 ) + END IF + END SELECT +! + IF ( INFO1 > 0 ) THEN + ! The SVD selected subroutine did not converge. + ! Return with an error code. + INFO = 2 + RETURN + END IF +! + IF ( RWORK(1) == ZERO ) THEN + ! The largest computed singular value of (scaled) + ! X is zero. Return error code -8 + ! (the 8th input variable had an illegal value). + K = 0 + INFO = -8 + CALL XERBLA('CGEDMD',-INFO) + RETURN + END IF +! + !<3> Determine the numerical rank of the data + ! snapshots matrix X. This depends on the + ! parameters NRNK and TOL. + + SELECT CASE ( NRNK ) + CASE ( -1 ) + K = 1 + DO i = 2, NUMRNK + IF ( ( RWORK(i) <= RWORK(1)*TOL ) .OR. & + ( RWORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE ( -2 ) + K = 1 + DO i = 1, NUMRNK-1 + IF ( ( RWORK(i+1) <= RWORK(i)*TOL ) .OR. & + ( RWORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE DEFAULT + K = 1 + DO i = 2, NRNK + IF ( RWORK(i) <= SMALL ) EXIT + K = K + 1 + END DO + END SELECT + ! Now, U = X(1:M,1:K) is the SVD/POD basis for the + ! snapshot data in the input matrix X. + + !<4> Compute the Rayleigh quotient S = U^H * A * U. + ! Depending on the requested outputs, the computation + ! is organized to compute additional auxiliary + ! matrices (for the residuals and refinements). + ! + ! In all formulas below, we need V_k*Sigma_k^(-1) + ! where either V_k is in W(1:N,1:K), or V_k^H is in + ! W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). + IF ( LSAME(T_OR_N, 'N') ) THEN + DO i = 1, K + CALL CSSCAL( N, ONE/RWORK(i), W(1,i), 1 ) ! BLAS CALL + ! W(1:N,i) = (ONE/RWORK(i)) * W(1:N,i) ! INTRINSIC + END DO + ELSE + ! This non-unit stride access is due to the fact + ! that CGESVD, CGESVDQ and CGESDD return the + ! adjoint matrix of the right singular vectors. + !DO i = 1, K + ! CALL DSCAL( N, ONE/RWORK(i), W(i,1), LDW ) ! BLAS CALL + ! ! W(i,1:N) = (ONE/RWORK(i)) * W(i,1:N) ! INTRINSIC + !END DO + DO i = 1, K + RWORK(N+i) = ONE/RWORK(i) + END DO + DO j = 1, N + DO i = 1, K + W(i,j) = CMPLX(RWORK(N+i),ZERO,KIND=WP)*W(i,j) + END DO + END DO + END IF +! + IF ( WNTREF ) THEN + ! + ! Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) + ! for computing the refined Ritz vectors + ! (optionally, outside CGEDMD). + CALL CGEMM( 'N', T_OR_N, M, K, N, ZONE, Y, LDY, W, & + LDW, ZZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T' + ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K)) ! INTRINSIC, for T_OR_N=='N' + ! + ! At this point Z contains + ! A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and + ! this is needed for computing the residuals. + ! This matrix is returned in the array B and + ! it can be used to compute refined Ritz vectors. + CALL CLACPY( 'A', M, K, Z, LDZ, B, LDB ) ! BLAS CALL + ! B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC + + CALL CGEMM( 'C', 'N', K, K, M, ZONE, X, LDX, Z, & + LDZ, ZZERO, S, LDS ) ! BLAS CALL + ! S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRINSIC + ! At this point S = U^H * A * U is the Rayleigh quotient. + ELSE + ! A * U(:,1:K) is not explicitly needed and the + ! computation is organized differently. The Rayleigh + ! quotient is computed more efficiently. + CALL CGEMM( 'C', 'N', K, N, M, ZONE, X, LDX, Y, LDY, & + ZZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) ) ! INTRINSIC + ! + CALL CGEMM( 'N', T_OR_N, K, K, N, ZONE, Z, LDZ, W, & + LDW, ZZERO, S, LDS ) ! BLAS CALL + ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T' + ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K))) ! INTRINSIC, for T_OR_N=='N' + ! At this point S = U^H * A * U is the Rayleigh quotient. + ! If the residuals are requested, save scaled V_k into Z. + ! Recall that V_k or V_k^H is stored in W. + IF ( WNTRES .OR. WNTEX ) THEN + IF ( LSAME(T_OR_N, 'N') ) THEN + CALL CLACPY( 'A', N, K, W, LDW, Z, LDZ ) + ELSE + CALL CLACPY( 'A', K, N, W, LDW, Z, LDZ ) + END IF + END IF + END IF +! + !<5> Compute the Ritz values and (if requested) the + ! right eigenvectors of the Rayleigh quotient. + ! + CALL CGEEV( 'N', JOBZL, K, S, LDS, EIGS, W, & + LDW, W, LDW, ZWORK, LZWORK, RWORK(N+1), INFO1 ) ! LAPACK CALL + ! + ! W(1:K,1:K) contains the eigenvectors of the Rayleigh + ! quotient. See the description of Z. + ! Also, see the description of CGEEV. + IF ( INFO1 > 0 ) THEN + ! CGEEV failed to compute the eigenvalues and + ! eigenvectors of the Rayleigh quotient. + INFO = 3 + RETURN + END IF +! + ! <6> Compute the eigenvectors (if requested) and, + ! the residuals (if requested). + ! + IF ( WNTVEC .OR. WNTEX ) THEN + IF ( WNTRES ) THEN + IF ( WNTREF ) THEN + ! Here, if the refinement is requested, we have + ! A*U(:,1:K) already computed and stored in Z. + ! For the residuals, need Y = A * U(:,1;K) * W. + CALL CGEMM( 'N', 'N', M, K, K, ZONE, Z, LDZ, W, & + LDW, ZZERO, Y, LDY ) ! BLAS CALL + ! Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC + ! This frees Z; Y contains A * U(:,1:K) * W. + ELSE + ! Compute S = V_k * Sigma_k^(-1) * W, where + ! V_k * Sigma_k^(-1) (or its adjoint) is stored in Z + CALL CGEMM( T_OR_N, 'N', N, K, K, ZONE, Z, LDZ, & + W, LDW, ZZERO, S, LDS) + ! Then, compute Z = Y * S = + ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = + ! = A * U(:,1:K) * W(1:K,1:K) + CALL CGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & + LDS, ZZERO, Z, LDZ) + ! Save a copy of Z into Y and free Z for holding + ! the Ritz vectors. + CALL CLACPY( 'A', M, K, Z, LDZ, Y, LDY ) + IF ( WNTEX ) CALL CLACPY( 'A', M, K, Z, LDZ, B, LDB ) + END IF + ELSE IF ( WNTEX ) THEN + ! Compute S = V_k * Sigma_k^(-1) * W, where + ! V_k * Sigma_k^(-1) is stored in Z + CALL CGEMM( T_OR_N, 'N', N, K, K, ZONE, Z, LDZ, & + W, LDW, ZZERO, S, LDS) + ! Then, compute Z = Y * S = + ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = + ! = A * U(:,1:K) * W(1:K,1:K) + CALL CGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & + LDS, ZZERO, B, LDB) + ! The above call replaces the following two calls + ! that were used in the developing-testing phase. + ! CALL CGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & + ! LDS, ZZERO, Z, LDZ) + ! Save a copy of Z into Y and free Z for holding + ! the Ritz vectors. + ! CALL CLACPY( 'A', M, K, Z, LDZ, B, LDB ) + END IF +! + ! Compute the Ritz vectors + IF ( WNTVEC ) CALL CGEMM( 'N', 'N', M, K, K, ZONE, X, LDX, W, LDW, & + ZZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC +! + IF ( WNTRES ) THEN + DO i = 1, K + CALL CAXPY( M, -EIGS(i), Z(1,i), 1, Y(1,i), 1 ) ! BLAS CALL + ! Y(1:M,i) = Y(1:M,i) - EIGS(i) * Z(1:M,i) ! INTRINSIC + RES(i) = SCNRM2( M, Y(1,i), 1) ! BLAS CALL + END DO + END IF + END IF +! + IF ( WHTSVD == 4 ) THEN + RWORK(N+1) = XSCL1 + RWORK(N+2) = XSCL2 + END IF +! +! Successful exit. + IF ( .NOT. BADXY ) THEN + INFO = 0 + ELSE + ! A warning on possible data inconsistency. + ! This should be a rare event. + INFO = 4 + END IF +!............................................................ + RETURN +! ...... + END SUBROUTINE CGEDMD + diff --git a/lapack-netlib/SRC/cgedmdq.c b/lapack-netlib/SRC/cgedmdq.c new file mode 100644 index 000000000..6e3a1faca --- /dev/null +++ b/lapack-netlib/SRC/cgedmdq.c @@ -0,0 +1,1289 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i= 0 */ +/* The state space dimension (the number of rows of F). */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshots from a single trajectory, */ +/* taken at equidistant discrete times. This is the */ +/* number of columns of F. */ +/* ..... */ +/* F (input/output) COMPLEX(KIND=WP) M-by-N array */ +/* > On entry, */ +/* the columns of F are the sequence of data snapshots */ +/* from a single trajectory, taken at equidistant discrete */ +/* times. It is assumed that the column norms of F are */ +/* in the range of the normalized floating point numbers. */ +/* < On exit, */ +/* If JOBQ == 'Q', the array F contains the orthogonal */ +/* matrix/factor of the QR factorization of the initial */ +/* data snapshots matrix F. See the description of JOBQ. */ +/* If JOBQ == 'N', the entries in F strictly below the main */ +/* diagonal contain, column-wise, the information on the */ +/* Householder vectors, as returned by CGEQRF. The */ +/* remaining information to restore the orthogonal matrix */ +/* of the initial QR factorization is stored in ZWORK(1:MIN(M,N)). */ +/* See the description of ZWORK. */ +/* ..... */ +/* LDF (input) INTEGER, LDF >= M */ +/* The leading dimension of the array F. */ +/* ..... */ +/* X (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array */ +/* X is used as workspace to hold representations of the */ +/* leading N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, the leading K columns of X contain the leading */ +/* K left singular vectors of the above described content */ +/* of X. To lift them to the space of the left singular */ +/* vectors U(:,1:K) of the input data, pre-multiply with the */ +/* Q factor from the initial QR factorization. */ +/* See the descriptions of F, K, V and Z. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= N */ +/* The leading dimension of the array X. */ +/* ..... */ +/* Y (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N) array */ +/* Y is used as workspace to hold representations of the */ +/* trailing N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, */ +/* If JOBT == 'R', Y contains the MIN(M,N)-by-N upper */ +/* triangular factor from the QR factorization of the data */ +/* snapshot matrix F. */ +/* ..... */ +/* LDY (input) INTEGER , LDY >= N */ +/* The leading dimension of the array Y. */ +/* ..... */ +/* NRNK (input) INTEGER */ +/* Determines the mode how to compute the numerical rank, */ +/* i.e. how to truncate small singular values of the input */ +/* matrix X. On input, if */ +/* NRNK = -1 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(1) */ +/* This option is recommended. */ +/* NRNK = -2 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(i-1) */ +/* This option is included for R&D purposes. */ +/* It requires highly accurate SVD, which */ +/* may not be feasible. */ +/* The numerical rank can be enforced by using positive */ +/* value of NRNK as follows: */ +/* 0 < NRNK <= N-1 :: at most NRNK largest singular values */ +/* will be used. If the number of the computed nonzero */ +/* singular values is less than NRNK, then only those */ +/* nonzero values will be used and the actually used */ +/* dimension is less than NRNK. The actual number of */ +/* the nonzero singular values is returned in the variable */ +/* K. See the description of K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the SVD/POD basis for the leading N-1 */ +/* data snapshots (columns of F) and the number of the */ +/* computed Ritz pairs. The value of K is determined */ +/* according to the rule set by the parameters NRNK and */ +/* TOL. See the descriptions of NRNK and TOL. */ +/* ..... */ +/* EIGS (output) COMPLEX(KIND=WP) (N-1)-by-1 array */ +/* The leading K (K<=N-1) entries of EIGS contain */ +/* the computed eigenvalues (Ritz values). */ +/* See the descriptions of K, and Z. */ +/* ..... */ +/* Z (workspace/output) COMPLEX(KIND=WP) M-by-(N-1) array */ +/* If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) */ +/* is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. */ +/* If JOBZ == 'F', then the Z(:,i)'s are given implicitly as */ +/* Z*V, where Z contains orthonormal matrix (the product of */ +/* Q from the initial QR factorization and the SVD/POD_basis */ +/* returned by CGEDMD in X) and the second factor (the */ +/* eigenvectors of the Rayleigh quotient) is in the array V, */ +/* as returned by CGEDMD. That is, X(:,1:K)*V(:,i) */ +/* is an eigenvector corresponding to EIGS(i). The columns */ +/* of V(1:K,1:K) are the computed eigenvectors of the */ +/* K-by-K Rayleigh quotient. */ +/* See the descriptions of EIGS, X and V. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... */ +/* RES (output) REAL(KIND=WP) (N-1)-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs, */ +/* RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. */ +/* See the description of EIGS and Z. */ +/* ..... */ +/* B (output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array. */ +/* IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:N,1;K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* In both cases, the content of B can be lifted to the */ +/* original dimension of the input data by pre-multiplying */ +/* with the Q factor from the initial QR factorization. */ +/* Here A denotes a compression of the underlying operator. */ +/* See the descriptions of F and X. */ +/* If JOBF =='N', then B is not referenced. */ +/* ..... */ +/* LDB (input) INTEGER, LDB >= MIN(M,N) */ +/* The leading dimension of the array B. */ +/* ..... */ +/* V (workspace/output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array */ +/* On exit, V(1:K,1:K) V contains the K eigenvectors of */ +/* the Rayleigh quotient. The Ritz vectors */ +/* (returned in Z) are the product of Q from the initial QR */ +/* factorization (see the description of F) X (see the */ +/* description of X) and V. */ +/* ..... */ +/* LDV (input) INTEGER, LDV >= N-1 */ +/* The leading dimension of the array V. */ +/* ..... */ +/* S (output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array */ +/* The array S(1:K,1:K) is used for the matrix Rayleigh */ +/* quotient. This content is overwritten during */ +/* the eigenvalue decomposition by CGEEV. */ +/* See the description of K. */ +/* ..... */ +/* LDS (input) INTEGER, LDS >= N-1 */ +/* The leading dimension of the array S. */ +/* ..... */ +/* ZWORK (workspace/output) COMPLEX(KIND=WP) LWORK-by-1 array */ +/* On exit, */ +/* ZWORK(1:MIN(M,N)) contains the scalar factors of the */ +/* elementary reflectors as returned by CGEQRF of the */ +/* M-by-N input matrix F. */ +/* If the call to CGEDMDQ is only workspace query, then */ +/* ZWORK(1) contains the minimal complex workspace length and */ +/* ZWORK(2) is the optimal complex workspace length. */ +/* Hence, the length of work is at least 2. */ +/* See the description of LZWORK. */ +/* ..... */ +/* LZWORK (input) INTEGER */ +/* The minimal length of the workspace vector ZWORK. */ +/* LZWORK is calculated as follows: */ +/* Let MLWQR = N (minimal workspace for CGEQRF[M,N]) */ +/* MLWDMD = minimal workspace for CGEDMD (see the */ +/* description of LWORK in CGEDMD) */ +/* MLWMQR = N (minimal workspace for */ +/* ZUNMQR['L','N',M,N,N]) */ +/* MLWGQR = N (minimal workspace for ZUNGQR[M,N,N]) */ +/* MINMN = MIN(M,N) */ +/* Then */ +/* LZWORK = MAX(2, MIN(M,N)+MLWQR, MINMN+MLWDMD) */ +/* is further updated as follows: */ +/* if JOBZ == 'V' or JOBZ == 'F' THEN */ +/* LZWORK = MAX( LZWORK, MINMN+MLWMQR ) */ +/* if JOBQ == 'Q' THEN */ +/* LZWORK = MAX( ZLWORK, MINMN+MLWGQR) */ + +/* ..... */ +/* WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */ +/* On exit, */ +/* WORK(1:N-1) contains the singular values of */ +/* the input submatrix F(1:M,1:N-1). */ +/* If the call to CGEDMDQ is only workspace query, then */ +/* WORK(1) contains the minimal workspace length and */ +/* WORK(2) is the optimal workspace length. hence, the */ +/* length of work is at least 2. */ +/* See the description of LWORK. */ +/* ..... */ +/* LWORK (input) INTEGER */ +/* The minimal length of the workspace vector WORK. */ +/* LWORK is the same as in CGEDMD, because in CGEDMDQ */ +/* only CGEDMD requires real workspace for snapshots */ +/* of dimensions MIN(M,N)-by-(N-1). */ +/* If on entry LWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */ +/* Workspace that is required only if WHTSVD equals */ +/* 2 , 3 or 4. (See the description of WHTSVD). */ +/* If on entry LWORK =-1 or LIWORK=-1, then the */ +/* minimal length of IWORK is computed and returned in */ +/* IWORK(1). See the description of LIWORK. */ +/* ..... */ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* Let M1=MIN(M,N), N1=N-1. Then */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */ +/* If on entry LIWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). */ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ +/* COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) */ + +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ + +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ + +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* External subroutines */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ +/* .......................................................... */ + /* Parameter adjustments */ + f_dim1 = *ldf; + f_offset = 1 + f_dim1 * 1; + f -= f_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --eigs; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + v_dim1 = *ldv; + v_offset = 1 + v_dim1 * 1; + v -= v_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --zwork; + --work; + --iwork; + + /* Function Body */ + one = 1.f; + zero = 0.f; + zzero.r = 0.f, zzero.i = 0.f; + +/* Test the input arguments */ + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntvcf = lsame_(jobz, "F"); + wntvcq = lsame_(jobz, "Q"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + wantq = lsame_(jobq, "Q"); + wnttrf = lsame_(jobt, "R"); + minmn = f2cmin(*m,*n); + *info = 0; + lquery = *lwork == -1 || *liwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || wntvcf || wntvcq || lsame_(jobz, "N"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && lsame_(jobz, "N")) { + *info = -3; + } else if (! (wantq || lsame_(jobq, "N"))) { + *info = -4; + } else if (! (wnttrf || lsame_(jobt, "N"))) { + *info = -5; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -6; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -7; + } else if (*m < 0) { + *info = -8; + } else if (*n < 0 || *n > *m + 1) { + *info = -9; + } else if (*ldf < *m) { + *info = -11; + } else if (*ldx < minmn) { + *info = -13; + } else if (*ldy < minmn) { + *info = -15; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -16; + } else if (*tol < zero || *tol >= one) { + *info = -17; + } else if (*ldz < *m) { + *info = -21; + } else if ((wntref || wntex) && *ldb < minmn) { + *info = -24; + } else if (*ldv < *n - 1) { + *info = -26; + } else if (*lds < *n - 1) { + *info = -28; + } + + if (wntvec || wntvcf || wntvcq) { + *(unsigned char *)jobvl = 'V'; + } else { + *(unsigned char *)jobvl = 'N'; + } + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0 || *n == 1) { +/* All output except K is void. INFO=1 signals */ +/* the void input. In case of a workspace query, */ +/* the minimal workspace lengths are returned. */ + if (lquery) { + iwork[1] = 1; + work[1] = 2.f; + work[2] = 2.f; + } else { + *k = 0; + } + *info = 1; + return 0; + } + mlrwrk = 2; + mlwork = 2; + olwork = 2; + iminwr = 1; + mlwqr = f2cmax(1,*n); +/* Minimal workspace length for CGEQRF. */ +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + cgeqrf_(m, n, &f[f_offset], ldf, &zwork[1], &zwork[1], &c_n1, & + info1); + olwqr = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwqr; + olwork = f2cmax(i__1,i__2); + } + i__1 = *n - 1; + cgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], + ldx, &y[y_offset], ldy, nrnk, tol, k, &eigs[1], &z__[z_offset] + , ldz, &res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[ + s_offset], lds, &zwork[1], lzwork, &work[1], &c_n1, &iwork[1], + liwork, &info1); + mlwdmd = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwdmd; + mlwork = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = (integer) work[1]; + mlrwrk = f2cmax(i__1,i__2); + iminwr = f2cmax(iminwr,iwork[1]); + if (lquery) { + olwdmd = (integer) zwork[2].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwdmd; + olwork = f2cmax(i__1,i__2); + } + if (wntvec || wntvcf) { + mlwmqr = f2cmax(1,*n); +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwmqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + cunmqr_("L", "N", m, n, &minmn, &f[f_offset], ldf, &zwork[1], + &z__[z_offset], ldz, &zwork[1], &c_n1, &info1); + olwmqr = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwmqr; + olwork = f2cmax(i__1,i__2); + } + } + if (wantq) { + mlwgqr = f2cmax(1,*n); +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwgqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + cungqr_(m, &minmn, &minmn, &f[f_offset], ldf, &zwork[1], & + zwork[1], &c_n1, &info1); + olwgqr = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwgqr; + olwork = f2cmax(i__1,i__2); + } + } + if (*liwork < iminwr && ! lquery) { + *info = -34; + } + if (*lwork < mlrwrk && ! lquery) { + *info = -32; + } + if (*lzwork < mlwork && ! lquery) { + *info = -30; + } + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CGEDMDQ", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + zwork[1].r = (real) mlwork, zwork[1].i = 0.f; + zwork[2].r = (real) olwork, zwork[2].i = 0.f; + work[1] = (real) mlrwrk; + work[2] = (real) mlrwrk; + return 0; + } +/* ..... */ +/* Initial QR factorization that is used to represent the */ +/* snapshots as elements of lower dimensional subspace. */ +/* For large scale computation with M >>N , at this place */ +/* one can use an out of core QRF. */ + + i__1 = *lzwork - minmn; + cgeqrf_(m, n, &f[f_offset], ldf, &zwork[1], &zwork[minmn + 1], &i__1, & + info1); + +/* Define X and Y as the snapshots representations in the */ +/* orthogonal basis computed in the QR factorization. */ +/* X corresponds to the leading N-1 and Y to the trailing */ +/* N-1 snapshots. */ + i__1 = *n - 1; + claset_("L", &minmn, &i__1, &zzero, &zzero, &x[x_offset], ldx); + i__1 = *n - 1; + clacpy_("U", &minmn, &i__1, &f[f_offset], ldf, &x[x_offset], ldx); + i__1 = *n - 1; + clacpy_("A", &minmn, &i__1, &f[(f_dim1 << 1) + 1], ldf, &y[y_offset], ldy); + if (*m >= 3) { + i__1 = minmn - 2; + i__2 = *n - 2; + claset_("L", &i__1, &i__2, &zzero, &zzero, &y[y_dim1 + 3], ldy); + } + +/* Compute the DMD of the projected snapshot pairs (X,Y) */ + i__1 = *n - 1; + i__2 = *lzwork - minmn; + cgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], ldx, + &y[y_offset], ldy, nrnk, tol, k, &eigs[1], &z__[z_offset], ldz, & + res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[s_offset], lds, & + zwork[minmn + 1], &i__2, &work[1], lwork, &iwork[1], liwork, & + info1); + if (info1 == 2 || info1 == 3) { +/* Return with error code. See CGEDMD for details. */ + *info = info1; + return 0; + } else { + *info = info1; + } + +/* The Ritz vectors (Koopman modes) can be explicitly */ +/* formed or returned in factored form. */ + if (wntvec) { +/* Compute the eigenvectors explicitly. */ + if (*m > minmn) { + i__1 = *m - minmn; + claset_("A", &i__1, k, &zzero, &zzero, &z__[minmn + 1 + z_dim1], + ldz); + } + i__1 = *lzwork - minmn; + cunmqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &zwork[1], &z__[ + z_offset], ldz, &zwork[minmn + 1], &i__1, &info1); + } else if (wntvcf) { +/* Return the Ritz vectors (eigenvectors) in factored */ +/* form Z*V, where Z contains orthonormal matrix (the */ +/* product of Q from the initial QR factorization and */ +/* the SVD/POD_basis returned by CGEDMD in X) and the */ +/* second factor (the eigenvectors of the Rayleigh */ +/* quotient) is in the array V, as returned by CGEDMD. */ + clacpy_("A", n, k, &x[x_offset], ldx, &z__[z_offset], ldz); + if (*m > *n) { + i__1 = *m - *n; + claset_("A", &i__1, k, &zzero, &zzero, &z__[*n + 1 + z_dim1], ldz); + } + i__1 = *lzwork - minmn; + cunmqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &zwork[1], &z__[ + z_offset], ldz, &zwork[minmn + 1], &i__1, &info1); + } + +/* Some optional output variables: */ + +/* The upper triangular factor R in the initial QR */ +/* factorization is optionally returned in the array Y. */ +/* This is useful if this call to CGEDMDQ is to be */ +/* followed by a streaming DMD that is implemented in a */ +/* QR compressed form. */ + if (wnttrf) { +/* Return the upper triangular R in Y */ + claset_("A", &minmn, n, &zzero, &zzero, &y[y_offset], ldy); + clacpy_("U", &minmn, n, &f[f_offset], ldf, &y[y_offset], ldy); + } + +/* The orthonormal/unitary factor Q in the initial QR */ +/* factorization is optionally returned in the array F. */ +/* Same as with the triangular factor above, this is */ +/* useful in a streaming DMD. */ + if (wantq) { +/* Q overwrites F */ + i__1 = *lzwork - minmn; + cungqr_(m, &minmn, &minmn, &f[f_offset], ldf, &zwork[1], &zwork[minmn + + 1], &i__1, &info1); + } + + return 0; + +} /* cgedmdq_ */ + diff --git a/lapack-netlib/SRC/cgedmdq.f90 b/lapack-netlib/SRC/cgedmdq.f90 new file mode 100644 index 000000000..52c1669c7 --- /dev/null +++ b/lapack-netlib/SRC/cgedmdq.f90 @@ -0,0 +1,689 @@ +SUBROUTINE CGEDMDQ( JOBS, JOBZ, JOBR, JOBQ, JOBT, JOBF, & + WHTSVD, M, N, F, LDF, X, LDX, Y, & + LDY, NRNK, TOL, K, EIGS, & + Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, ZWORK, LZWORK, WORK, LWORK, & + IWORK, LIWORK, INFO ) +! March 2023 +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real32 +!..... +! Scalar arguments + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBQ, & + JOBT, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDF, LDX, & + LDY, NRNK, LDZ, LDB, LDV, & + LDS, LZWORK, LWORK, LIWORK + INTEGER, INTENT(OUT) :: INFO, K + REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments + COMPLEX(KIND=WP), INTENT(INOUT) :: F(LDF,*) + COMPLEX(KIND=WP), INTENT(OUT) :: X(LDX,*), Y(LDY,*), & + Z(LDZ,*), B(LDB,*), & + V(LDV,*), S(LDS,*) + COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) + COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) + REAL(KIND=WP), INTENT(OUT) :: RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +!..... +! Purpose +! ======= +! CGEDMDQ computes the Dynamic Mode Decomposition (DMD) for +! a pair of data snapshot matrices, using a QR factorization +! based compression of the data. For the input matrices +! X and Y such that Y = A*X with an unaccessible matrix +! A, CGEDMDQ computes a certain number of Ritz pairs of A using +! the standard Rayleigh-Ritz extraction from a subspace of +! range(X) that is determined using the leading left singular +! vectors of X. Optionally, CGEDMDQ returns the residuals +! of the computed Ritz pairs, the information needed for +! a refinement of the Ritz vectors, or the eigenvectors of +! the Exact DMD. +! For further details see the references listed +! below. For more details of the implementation see [3]. +! +! References +! ========== +! [1] P. Schmid: Dynamic mode decomposition of numerical +! and experimental data, +! Journal of Fluid Mechanics 656, 5-28, 2010. +! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +! decompositions: analysis and enhancements, +! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +! [3] Z. Drmac: A LAPACK implementation of the Dynamic +! Mode Decomposition I. Technical report. AIMDyn Inc. +! and LAPACK Working Note 298. +! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +! Brunton, N. Kutz: On Dynamic Mode Decomposition: +! Theory and Applications, Journal of Computational +! Dynamics 1(2), 391 -421, 2014. +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! and supported by +! - DARPA SBIR project "Koopman Operator-Based Forecasting +! for Nonstationary Processes from Near-Term, Limited +! Observational Data" Contract No: W31P4Q-21-C-0007 +! - DARPA PAI project "Physics-Informed Machine Learning +! Methodologies" Contract No: HR0011-18-9-0033 +! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +! Framework for Space-Time Analysis of Process Dynamics" +! Contract No: HR0011-16-C-0116 +! Any opinions, findings and conclusions or recommendations +! expressed in this material are those of the author and +! do not necessarily reflect the views of the DARPA SBIR +! Program Office. +!============================================================ +! Distribution Statement A: +! Approved for Public Release, Distribution Unlimited. +! Cleared by DARPA on September 29, 2022 +!============================================================ +!...................................................................... +! Arguments +! ========= +! JOBS (input) CHARACTER*1 +! Determines whether the initial data snapshots are scaled +! by a diagonal matrix. The data snapshots are the columns +! of F. The leading N-1 columns of F are denoted X and the +! trailing N-1 columns are denoted Y. +! 'S' :: The data snapshots matrices X and Y are multiplied +! with a diagonal matrix D so that X*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'C' :: The snapshots are scaled as with the 'S' option. +! If it is found that an i-th column of X is zero +! vector and the corresponding i-th column of Y is +! non-zero, then the i-th column of Y is set to +! zero and a warning flag is raised. +! 'Y' :: The data snapshots matrices X and Y are multiplied +! by a diagonal matrix D so that Y*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'N' :: No data scaling. +!..... +! JOBZ (input) CHARACTER*1 +! Determines whether the eigenvectors (Koopman modes) will +! be computed. +! 'V' :: The eigenvectors (Koopman modes) will be computed +! and returned in the matrix Z. +! See the description of Z. +! 'F' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Z*V, where Z +! is orthonormal and V contains the eigenvectors +! of the corresponding Rayleigh quotient. +! See the descriptions of F, V, Z. +! 'Q' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Q*Z, where Z +! contains the eigenvectors of the compression of the +! underlying discretised operator onto the span of +! the data snapshots. See the descriptions of F, V, Z. +! Q is from the inital QR facorization. +! 'N' :: The eigenvectors are not computed. +!..... +! JOBR (input) CHARACTER*1 +! Determines whether to compute the residuals. +! 'R' :: The residuals for the computed eigenpairs will +! be computed and stored in the array RES. +! See the description of RES. +! For this option to be legal, JOBZ must be 'V'. +! 'N' :: The residuals are not computed. +!..... +! JOBQ (input) CHARACTER*1 +! Specifies whether to explicitly compute and return the +! unitary matrix from the QR factorization. +! 'Q' :: The matrix Q of the QR factorization of the data +! snapshot matrix is computed and stored in the +! array F. See the description of F. +! 'N' :: The matrix Q is not explicitly computed. +!..... +! JOBT (input) CHARACTER*1 +! Specifies whether to return the upper triangular factor +! from the QR factorization. +! 'R' :: The matrix R of the QR factorization of the data +! snapshot matrix F is returned in the array Y. +! See the description of Y and Further details. +! 'N' :: The matrix R is not returned. +!..... +! JOBF (input) CHARACTER*1 +! Specifies whether to store information needed for post- +! processing (e.g. computing refined Ritz vectors) +! 'R' :: The matrix needed for the refinement of the Ritz +! vectors is computed and stored in the array B. +! See the description of B. +! 'E' :: The unscaled eigenvectors of the Exact DMD are +! computed and returned in the array B. See the +! description of B. +! 'N' :: No eigenvector refinement data is computed. +! To be useful on exit, this option needs JOBQ='Q'. +!..... +! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +! Allows for a selection of the SVD algorithm from the +! LAPACK library. +! 1 :: CGESVD (the QR SVD algorithm) +! 2 :: CGESDD (the Divide and Conquer algorithm; if enough +! workspace available, this is the fastest option) +! 3 :: CGESVDQ (the preconditioned QR SVD ; this and 4 +! are the most accurate options) +! 4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3 +! are the most accurate options) +! For the four methods above, a significant difference in +! the accuracy of small singular values is possible if +! the snapshots vary in norm so that X is severely +! ill-conditioned. If small (smaller than EPS*||X||) +! singular values are of interest and JOBS=='N', then +! the options (3, 4) give the most accurate results, where +! the option 4 is slightly better and with stronger +! theoretical background. +! If JOBS=='S', i.e. the columns of X will be normalized, +! then all methods give nearly equally accurate results. +!..... +! M (input) INTEGER, M >= 0 +! The state space dimension (the number of rows of F). +!..... +! N (input) INTEGER, 0 <= N <= M +! The number of data snapshots from a single trajectory, +! taken at equidistant discrete times. This is the +! number of columns of F. +!..... +! F (input/output) COMPLEX(KIND=WP) M-by-N array +! > On entry, +! the columns of F are the sequence of data snapshots +! from a single trajectory, taken at equidistant discrete +! times. It is assumed that the column norms of F are +! in the range of the normalized floating point numbers. +! < On exit, +! If JOBQ == 'Q', the array F contains the orthogonal +! matrix/factor of the QR factorization of the initial +! data snapshots matrix F. See the description of JOBQ. +! If JOBQ == 'N', the entries in F strictly below the main +! diagonal contain, column-wise, the information on the +! Householder vectors, as returned by CGEQRF. The +! remaining information to restore the orthogonal matrix +! of the initial QR factorization is stored in ZWORK(1:MIN(M,N)). +! See the description of ZWORK. +!..... +! LDF (input) INTEGER, LDF >= M +! The leading dimension of the array F. +!..... +! X (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array +! X is used as workspace to hold representations of the +! leading N-1 snapshots in the orthonormal basis computed +! in the QR factorization of F. +! On exit, the leading K columns of X contain the leading +! K left singular vectors of the above described content +! of X. To lift them to the space of the left singular +! vectors U(:,1:K) of the input data, pre-multiply with the +! Q factor from the initial QR factorization. +! See the descriptions of F, K, V and Z. +!..... +! LDX (input) INTEGER, LDX >= N +! The leading dimension of the array X. +!..... +! Y (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N) array +! Y is used as workspace to hold representations of the +! trailing N-1 snapshots in the orthonormal basis computed +! in the QR factorization of F. +! On exit, +! If JOBT == 'R', Y contains the MIN(M,N)-by-N upper +! triangular factor from the QR factorization of the data +! snapshot matrix F. +!..... +! LDY (input) INTEGER , LDY >= N +! The leading dimension of the array Y. +!..... +! NRNK (input) INTEGER +! Determines the mode how to compute the numerical rank, +! i.e. how to truncate small singular values of the input +! matrix X. On input, if +! NRNK = -1 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(1) +! This option is recommended. +! NRNK = -2 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(i-1) +! This option is included for R&D purposes. +! It requires highly accurate SVD, which +! may not be feasible. +! The numerical rank can be enforced by using positive +! value of NRNK as follows: +! 0 < NRNK <= N-1 :: at most NRNK largest singular values +! will be used. If the number of the computed nonzero +! singular values is less than NRNK, then only those +! nonzero values will be used and the actually used +! dimension is less than NRNK. The actual number of +! the nonzero singular values is returned in the variable +! K. See the description of K. +!..... +! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +! The tolerance for truncating small singular values. +! See the description of NRNK. +!..... +! K (output) INTEGER, 0 <= K <= N +! The dimension of the SVD/POD basis for the leading N-1 +! data snapshots (columns of F) and the number of the +! computed Ritz pairs. The value of K is determined +! according to the rule set by the parameters NRNK and +! TOL. See the descriptions of NRNK and TOL. +!..... +! EIGS (output) COMPLEX(KIND=WP) (N-1)-by-1 array +! The leading K (K<=N-1) entries of EIGS contain +! the computed eigenvalues (Ritz values). +! See the descriptions of K, and Z. +!..... +! Z (workspace/output) COMPLEX(KIND=WP) M-by-(N-1) array +! If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) +! is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. +! If JOBZ == 'F', then the Z(:,i)'s are given implicitly as +! Z*V, where Z contains orthonormal matrix (the product of +! Q from the initial QR factorization and the SVD/POD_basis +! returned by CGEDMD in X) and the second factor (the +! eigenvectors of the Rayleigh quotient) is in the array V, +! as returned by CGEDMD. That is, X(:,1:K)*V(:,i) +! is an eigenvector corresponding to EIGS(i). The columns +! of V(1:K,1:K) are the computed eigenvectors of the +! K-by-K Rayleigh quotient. +! See the descriptions of EIGS, X and V. +!..... +! LDZ (input) INTEGER , LDZ >= M +! The leading dimension of the array Z. +!..... +! RES (output) REAL(KIND=WP) (N-1)-by-1 array +! RES(1:K) contains the residuals for the K computed +! Ritz pairs, +! RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. +! See the description of EIGS and Z. +!..... +! B (output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array. +! IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can +! be used for computing the refined vectors; see further +! details in the provided references. +! If JOBF == 'E', B(1:N,1;K) contains +! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +! Exact DMD, up to scaling by the inverse eigenvalues. +! In both cases, the content of B can be lifted to the +! original dimension of the input data by pre-multiplying +! with the Q factor from the initial QR factorization. +! Here A denotes a compression of the underlying operator. +! See the descriptions of F and X. +! If JOBF =='N', then B is not referenced. +!..... +! LDB (input) INTEGER, LDB >= MIN(M,N) +! The leading dimension of the array B. +!..... +! V (workspace/output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array +! On exit, V(1:K,1:K) V contains the K eigenvectors of +! the Rayleigh quotient. The Ritz vectors +! (returned in Z) are the product of Q from the initial QR +! factorization (see the description of F) X (see the +! description of X) and V. +!..... +! LDV (input) INTEGER, LDV >= N-1 +! The leading dimension of the array V. +!..... +! S (output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array +! The array S(1:K,1:K) is used for the matrix Rayleigh +! quotient. This content is overwritten during +! the eigenvalue decomposition by CGEEV. +! See the description of K. +!..... +! LDS (input) INTEGER, LDS >= N-1 +! The leading dimension of the array S. +!..... +! ZWORK (workspace/output) COMPLEX(KIND=WP) LWORK-by-1 array +! On exit, +! ZWORK(1:MIN(M,N)) contains the scalar factors of the +! elementary reflectors as returned by CGEQRF of the +! M-by-N input matrix F. +! If the call to CGEDMDQ is only workspace query, then +! ZWORK(1) contains the minimal complex workspace length and +! ZWORK(2) is the optimal complex workspace length. +! Hence, the length of work is at least 2. +! See the description of LZWORK. +!..... +! LZWORK (input) INTEGER +! The minimal length of the workspace vector ZWORK. +! LZWORK is calculated as follows: +! Let MLWQR = N (minimal workspace for CGEQRF[M,N]) +! MLWDMD = minimal workspace for CGEDMD (see the +! description of LWORK in CGEDMD) +! MLWMQR = N (minimal workspace for +! ZUNMQR['L','N',M,N,N]) +! MLWGQR = N (minimal workspace for ZUNGQR[M,N,N]) +! MINMN = MIN(M,N) +! Then +! LZWORK = MAX(2, MIN(M,N)+MLWQR, MINMN+MLWDMD) +! is further updated as follows: +! if JOBZ == 'V' or JOBZ == 'F' THEN +! LZWORK = MAX( LZWORK, MINMN+MLWMQR ) +! if JOBQ == 'Q' THEN +! LZWORK = MAX( ZLWORK, MINMN+MLWGQR) +! +!..... +! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +! On exit, +! WORK(1:N-1) contains the singular values of +! the input submatrix F(1:M,1:N-1). +! If the call to CGEDMDQ is only workspace query, then +! WORK(1) contains the minimal workspace length and +! WORK(2) is the optimal workspace length. hence, the +! length of work is at least 2. +! See the description of LWORK. +!..... +! LWORK (input) INTEGER +! The minimal length of the workspace vector WORK. +! LWORK is the same as in CGEDMD, because in CGEDMDQ +! only CGEDMD requires real workspace for snapshots +! of dimensions MIN(M,N)-by-(N-1). +! If on entry LWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! IWORK (workspace/output) INTEGER LIWORK-by-1 array +! Workspace that is required only if WHTSVD equals +! 2 , 3 or 4. (See the description of WHTSVD). +! If on entry LWORK =-1 or LIWORK=-1, then the +! minimal length of IWORK is computed and returned in +! IWORK(1). See the description of LIWORK. +!..... +! LIWORK (input) INTEGER +! The minimal length of the workspace vector IWORK. +! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +! Let M1=MIN(M,N), N1=N-1. Then +! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +! If on entry LIWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! INFO (output) INTEGER +! -i < 0 :: On entry, the i-th argument had an +! illegal value +! = 0 :: Successful return. +! = 1 :: Void input. Quick exit (M=0 or N=0). +! = 2 :: The SVD computation of X did not converge. +! Suggestion: Check the input data and/or +! repeat with different WHTSVD. +! = 3 :: The computation of the eigenvalues did not +! converge. +! = 4 :: If data scaling was requested on input and +! the procedure found inconsistency in the data +! such that for some column index i, +! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +! to zero if JOBS=='C'. The computation proceeds +! with original or modified data and warning +! flag is set with INFO=4. +!............................................................. +!............................................................. +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP +! COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) + COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) +! +! Local scalars +! ~~~~~~~~~~~~~ + INTEGER :: IMINWR, INFO1, MINMN, MLRWRK, & + MLWDMD, MLWGQR, MLWMQR, MLWORK, & + MLWQR, OLWDMD, OLWGQR, OLWMQR, & + OLWORK, OLWQR + LOGICAL :: LQUERY, SCCOLX, SCCOLY, WANTQ, & + WNTTRF, WNTRES, WNTVEC, WNTVCF, & + WNTVCQ, WNTREF, WNTEX + CHARACTER(LEN=1) :: JOBVL +! +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + LOGICAL LSAME + EXTERNAL LSAME +! +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL CGEQRF, CLACPY, CLASET, CUNGQR, & + CUNMQR, XERBLA + +! External subroutines +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL CGEDMD + +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC MAX, MIN, INT + !.......................................................... + ! + ! Test the input arguments + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME( JOBS, 'C' ) + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTVCF = LSAME(JOBZ,'F') + WNTVCQ = LSAME(JOBZ,'Q') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + WANTQ = LSAME(JOBQ,'Q') + WNTTRF = LSAME(JOBT,'R') + MINMN = MIN(M,N) + INFO = 0 + LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. & + LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. WNTVCF .OR. WNTVCQ & + .OR. LSAME(JOBZ,'N')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. LSAME(JOBZ,'N') ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WANTQ .OR. LSAME(JOBQ,'N')) ) THEN + INFO = -4 + ELSE IF ( .NOT. ( WNTTRF .OR. LSAME(JOBT,'N') ) ) THEN + INFO = -5 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -6 + ELSE IF ( .NOT. ((WHTSVD == 1).OR.(WHTSVD == 2).OR. & + (WHTSVD == 3).OR.(WHTSVD == 4)) ) THEN + INFO = -7 + ELSE IF ( M < 0 ) THEN + INFO = -8 + ELSE IF ( ( N < 0 ) .OR. ( N > M+1 ) ) THEN + INFO = -9 + ELSE IF ( LDF < M ) THEN + INFO = -11 + ELSE IF ( LDX < MINMN ) THEN + INFO = -13 + ELSE IF ( LDY < MINMN ) THEN + INFO = -15 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -16 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -17 + ELSE IF ( LDZ < M ) THEN + INFO = -21 + ELSE IF ( (WNTREF.OR.WNTEX ).AND.( LDB < MINMN ) ) THEN + INFO = -24 + ELSE IF ( LDV < N-1 ) THEN + INFO = -26 + ELSE IF ( LDS < N-1 ) THEN + INFO = -28 + END IF +! + IF ( WNTVEC .OR. WNTVCF .OR. WNTVCQ ) THEN + JOBVL = 'V' + ELSE + JOBVL = 'N' + END IF + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( ( N == 0 ) .OR. ( N == 1 ) ) THEN + ! All output except K is void. INFO=1 signals + ! the void input. In case of a workspace query, + ! the minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + WORK(1) = 2 + WORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + + MLRWRK = 2 + MLWORK = 2 + OLWORK = 2 + IMINWR = 1 + MLWQR = MAX(1,N) ! Minimal workspace length for CGEQRF. + MLWORK = MAX(MLWORK,MINMN + MLWQR) + + IF ( LQUERY ) THEN + CALL CGEQRF( M, N, F, LDF, ZWORK, ZWORK, -1, & + INFO1 ) + OLWQR = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,MINMN + OLWQR) + END IF + CALL CGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,& + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + EIGS, Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, ZWORK, LZWORK, WORK, -1, IWORK,& + LIWORK, INFO1 ) + MLWDMD = INT(ZWORK(1)) + MLWORK = MAX(MLWORK, MINMN + MLWDMD) + MLRWRK = MAX(MLRWRK, INT(WORK(1))) + IMINWR = MAX(IMINWR, IWORK(1)) + IF ( LQUERY ) THEN + OLWDMD = INT(ZWORK(2)) + OLWORK = MAX(OLWORK, MINMN+OLWDMD) + END IF + IF ( WNTVEC .OR. WNTVCF ) THEN + MLWMQR = MAX(1,N) + MLWORK = MAX(MLWORK, MINMN+MLWMQR) + IF ( LQUERY ) THEN + CALL CUNMQR( 'L','N', M, N, MINMN, F, LDF, & + ZWORK, Z, LDZ, ZWORK, -1, INFO1 ) + OLWMQR = INT(ZWORK(1)) + OLWORK = MAX(OLWORK, MINMN+OLWMQR) + END IF + END IF + IF ( WANTQ ) THEN + MLWGQR = MAX(1,N) + MLWORK = MAX(MLWORK, MINMN+MLWGQR) + IF ( LQUERY ) THEN + CALL CUNGQR( M, MINMN, MINMN, F, LDF, ZWORK, & + ZWORK, -1, INFO1 ) + OLWGQR = INT(ZWORK(1)) + OLWORK = MAX(OLWORK, MINMN+OLWGQR) + END IF + END IF + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -34 + IF ( LWORK < MLRWRK .AND. (.NOT.LQUERY) ) INFO = -32 + IF ( LZWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -30 + END IF + IF( INFO /= 0 ) THEN + CALL XERBLA( 'CGEDMDQ', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + ZWORK(1) = MLWORK + ZWORK(2) = OLWORK + WORK(1) = MLRWRK + WORK(2) = MLRWRK + RETURN + END IF +!..... +! Initial QR factorization that is used to represent the +! snapshots as elements of lower dimensional subspace. +! For large scale computation with M >>N , at this place +! one can use an out of core QRF. +! + CALL CGEQRF( M, N, F, LDF, ZWORK, & + ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) +! +! Define X and Y as the snapshots representations in the +! orthogonal basis computed in the QR factorization. +! X corresponds to the leading N-1 and Y to the trailing +! N-1 snapshots. + CALL CLASET( 'L', MINMN, N-1, ZZERO, ZZERO, X, LDX ) + CALL CLACPY( 'U', MINMN, N-1, F, LDF, X, LDX ) + CALL CLACPY( 'A', MINMN, N-1, F(1,2), LDF, Y, LDY ) + IF ( M >= 3 ) THEN + CALL CLASET( 'L', MINMN-2, N-2, ZZERO, ZZERO, & + Y(3,1), LDY ) + END IF +! +! Compute the DMD of the projected snapshot pairs (X,Y) + CALL CGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN, & + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + EIGS, Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, ZWORK(MINMN+1), LZWORK-MINMN, & + WORK, LWORK, IWORK, LIWORK, INFO1 ) + IF ( INFO1 == 2 .OR. INFO1 == 3 ) THEN + ! Return with error code. See CGEDMD for details. + INFO = INFO1 + RETURN + ELSE + INFO = INFO1 + END IF +! +! The Ritz vectors (Koopman modes) can be explicitly +! formed or returned in factored form. + IF ( WNTVEC ) THEN + ! Compute the eigenvectors explicitly. + IF ( M > MINMN ) CALL CLASET( 'A', M-MINMN, K, ZZERO, & + ZZERO, Z(MINMN+1,1), LDZ ) + CALL CUNMQR( 'L','N', M, K, MINMN, F, LDF, ZWORK, Z, & + LDZ, ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) + ELSE IF ( WNTVCF ) THEN + ! Return the Ritz vectors (eigenvectors) in factored + ! form Z*V, where Z contains orthonormal matrix (the + ! product of Q from the initial QR factorization and + ! the SVD/POD_basis returned by CGEDMD in X) and the + ! second factor (the eigenvectors of the Rayleigh + ! quotient) is in the array V, as returned by CGEDMD. + CALL CLACPY( 'A', N, K, X, LDX, Z, LDZ ) + IF ( M > N ) CALL CLASET( 'A', M-N, K, ZZERO, ZZERO, & + Z(N+1,1), LDZ ) + CALL CUNMQR( 'L','N', M, K, MINMN, F, LDF, ZWORK, Z, & + LDZ, ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) + END IF +! +! Some optional output variables: +! +! The upper triangular factor R in the initial QR +! factorization is optionally returned in the array Y. +! This is useful if this call to CGEDMDQ is to be + +! followed by a streaming DMD that is implemented in a +! QR compressed form. + IF ( WNTTRF ) THEN ! Return the upper triangular R in Y + CALL CLASET( 'A', MINMN, N, ZZERO, ZZERO, Y, LDY ) + CALL CLACPY( 'U', MINMN, N, F, LDF, Y, LDY ) + END IF +! +! The orthonormal/unitary factor Q in the initial QR +! factorization is optionally returned in the array F. +! Same as with the triangular factor above, this is +! useful in a streaming DMD. + IF ( WANTQ ) THEN ! Q overwrites F + CALL CUNGQR( M, MINMN, MINMN, F, LDF, ZWORK, & + ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) + END IF +! + RETURN +! + END SUBROUTINE CGEDMDQ + \ No newline at end of file diff --git a/lapack-netlib/SRC/cgees.f b/lapack-netlib/SRC/cgees.f index 71acfdba3..2085dc49b 100644 --- a/lapack-netlib/SRC/cgees.f +++ b/lapack-netlib/SRC/cgees.f @@ -189,7 +189,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup gees * * ===================================================================== SUBROUTINE CGEES( JOBVS, SORT, SELECT, N, A, LDA, SDIM, W, VS, @@ -230,13 +230,13 @@ * .. * .. External Subroutines .. EXTERNAL CCOPY, CGEBAK, CGEBAL, CGEHRD, CHSEQR, CLACPY, - $ CLASCL, CTRSEN, CUNGHR, SLABAD, XERBLA + $ CLASCL, CTRSEN, CUNGHR, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANGE, SLAMCH - EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -292,7 +292,7 @@ MAXWRK = MAX( MAXWRK, HSWORK ) END IF END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -12 @@ -318,7 +318,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -413,7 +412,7 @@ CALL CCOPY( N, A, LDA+1, W, 1 ) END IF * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of CGEES diff --git a/lapack-netlib/SRC/cgeesx.f b/lapack-netlib/SRC/cgeesx.f index 782e36747..036ae90c2 100644 --- a/lapack-netlib/SRC/cgeesx.f +++ b/lapack-netlib/SRC/cgeesx.f @@ -230,7 +230,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup geesx * * ===================================================================== SUBROUTINE CGEESX( JOBVS, SORT, SELECT, SENSE, N, A, LDA, SDIM, W, @@ -274,13 +274,13 @@ * .. * .. External Subroutines .. EXTERNAL CCOPY, CGEBAK, CGEBAL, CGEHRD, CHSEQR, CLACPY, - $ CLASCL, CTRSEN, CUNGHR, SLABAD, SLASCL, XERBLA + $ CLASCL, CTRSEN, CUNGHR, SLASCL, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANGE, SLAMCH - EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -350,7 +350,7 @@ IF( .NOT.WANTSN ) $ LWRK = MAX( LWRK, ( N*N )/2 ) END IF - WORK( 1 ) = LWRK + WORK( 1 ) = SROUNDUP_LWORK(LWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -15 @@ -376,7 +376,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -488,7 +487,7 @@ END IF END IF * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of CGEESX diff --git a/lapack-netlib/SRC/cgeev.f b/lapack-netlib/SRC/cgeev.f index a77525ef8..bb41599d1 100644 --- a/lapack-netlib/SRC/cgeev.f +++ b/lapack-netlib/SRC/cgeev.f @@ -172,7 +172,7 @@ * * @generated from zgeev.f, fortran z -> c, Tue Apr 19 01:47:44 2016 * -*> \ingroup complexGEeigen +*> \ingroup geev * * ===================================================================== SUBROUTINE CGEEV( JOBVL, JOBVR, N, A, LDA, W, VL, LDVL, VR, LDVR, @@ -212,14 +212,15 @@ REAL DUM( 1 ) * .. * .. External Subroutines .. - EXTERNAL SLABAD, XERBLA, CSSCAL, CGEBAK, CGEBAL, CGEHRD, + EXTERNAL XERBLA, CSSCAL, CGEBAK, CGEBAL, CGEHRD, $ CHSEQR, CLACPY, CLASCL, CSCAL, CTREVC3, CUNGHR * .. * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX, ILAENV - REAL SLAMCH, SCNRM2, CLANGE - EXTERNAL LSAME, ISAMAX, ILAENV, SLAMCH, SCNRM2, CLANGE + REAL SLAMCH, SCNRM2, CLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ISAMAX, ILAENV, SLAMCH, SCNRM2, CLANGE, + $ SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC REAL, CMPLX, CONJG, AIMAG, MAX, SQRT @@ -291,7 +292,7 @@ HSWORK = INT( WORK(1) ) MAXWRK = MAX( MAXWRK, HSWORK, MINWRK ) END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -12 @@ -315,7 +316,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -493,7 +493,7 @@ END IF END IF * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of CGEEV diff --git a/lapack-netlib/SRC/cgeevx.f b/lapack-netlib/SRC/cgeevx.f index 2388f5acc..5dbc394e9 100644 --- a/lapack-netlib/SRC/cgeevx.f +++ b/lapack-netlib/SRC/cgeevx.f @@ -279,7 +279,7 @@ * * @generated from zgeevx.f, fortran z -> c, Tue Apr 19 01:47:44 2016 * -*> \ingroup complexGEeigen +*> \ingroup geevx * * ===================================================================== SUBROUTINE CGEEVX( BALANC, JOBVL, JOBVR, SENSE, N, A, LDA, W, VL, @@ -323,15 +323,16 @@ REAL DUM( 1 ) * .. * .. External Subroutines .. - EXTERNAL SLABAD, SLASCL, XERBLA, CSSCAL, CGEBAK, CGEBAL, + EXTERNAL SLASCL, XERBLA, CSSCAL, CGEBAK, CGEBAL, $ CGEHRD, CHSEQR, CLACPY, CLASCL, CSCAL, CTREVC3, $ CTRSNA, CUNGHR * .. * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX, ILAENV - REAL SLAMCH, SCNRM2, CLANGE - EXTERNAL LSAME, ISAMAX, ILAENV, SLAMCH, SCNRM2, CLANGE + REAL SLAMCH, SCNRM2, CLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ISAMAX, ILAENV, SLAMCH, SCNRM2, CLANGE, + $ SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC REAL, CMPLX, CONJG, AIMAG, MAX, SQRT @@ -434,7 +435,7 @@ END IF MAXWRK = MAX( MAXWRK, MINWRK ) END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -20 @@ -458,7 +459,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -657,7 +657,7 @@ END IF END IF * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of CGEEVX diff --git a/lapack-netlib/SRC/cgehrd.f b/lapack-netlib/SRC/cgehrd.f index d9c050267..7ba87cc01 100644 --- a/lapack-netlib/SRC/cgehrd.f +++ b/lapack-netlib/SRC/cgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup gehrd * *> \par Further Details: * ===================== @@ -201,7 +201,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -221,13 +222,19 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'CGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE - WORK( 1 ) = LWKOPT + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -248,7 +255,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -268,7 +274,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of @@ -344,7 +350,8 @@ * Use unblocked code to reduce the rest of the matrix * CALL CGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO ) - WORK( 1 ) = LWKOPT +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cgejsv.f b/lapack-netlib/SRC/cgejsv.f index 062ac182b..1fc75613e 100644 --- a/lapack-netlib/SRC/cgejsv.f +++ b/lapack-netlib/SRC/cgejsv.f @@ -52,10 +52,10 @@ *> are computed and stored in the arrays U and V, respectively. The diagonal *> of [SIGMA] is computed and stored in the array SVA. *> \endverbatim -*> -*> Arguments: -*> ========== -*> +* +* Arguments: +* ========== +* *> \param[in] JOBA *> \verbatim *> JOBA is CHARACTER*1 @@ -151,7 +151,7 @@ *> transposed A if A^* seems to be better with respect to convergence. *> If the matrix is not square, JOBT is ignored. *> The decision is based on two values of entropy over the adjoint -*> orbit of A^* * A. See the descriptions of WORK(6) and WORK(7). +*> orbit of A^* * A. See the descriptions of RWORK(6) and RWORK(7). *> = 'T': transpose if entropy test indicates possibly faster *> convergence of Jacobi process if A^* is taken as input. If A is *> replaced with A^*, then the row pivoting is included automatically. @@ -209,11 +209,11 @@ *> \verbatim *> SVA is REAL array, dimension (N) *> On exit, -*> - For WORK(1)/WORK(2) = ONE: The singular values of A. During the -*> computation SVA contains Euclidean column norms of the +*> - For RWORK(1)/RWORK(2) = ONE: The singular values of A. During +*> the computation SVA contains Euclidean column norms of the *> iterated matrices in the array A. -*> - For WORK(1) .NE. WORK(2): The singular values of A are -*> (WORK(1)/WORK(2)) * SVA(1:N). This factored form is used if +*> - For RWORK(1) .NE. RWORK(2): The singular values of A are +*> (RWORK(1)/RWORK(2)) * SVA(1:N). This factored form is used if *> sigma_max(A) overflows or if small singular values have been *> saved from underflow by scaling the input matrix A. *> - If JOBR='R' then some of the singular values may be returned @@ -252,7 +252,7 @@ *> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of *> the right singular vectors; *> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), -*> then V is used as workspace if the pprocedure +*> then V is used as workspace if the procedure *> replaces A with A^*. In that case, [U] is computed *> in V as right singular vectors of A^* and then *> copied back to the U array. This 'W' option is just @@ -1819,7 +1819,7 @@ IF ( CONDR2 .GE. COND_OK ) THEN * .. save the Householder vectors used for Q3 * (this overwrites the copy of R2, as it will not be -* needed in this branch, but it does not overwritte the +* needed in this branch, but it does not overwrite the * Huseholder vectors of Q2.). CALL CLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N ) * .. and the rest of the information on Q3 is in @@ -1842,7 +1842,7 @@ END IF * * Second preconditioning finished; continue with Jacobi SVD -* The input matrix is lower trinagular. +* The input matrix is lower triangular. * * Recover the right singular vectors as solution of a well * conditioned triangular matrix equation. @@ -1886,7 +1886,7 @@ ELSE IF ( CONDR2 .LT. COND_OK ) THEN * * The matrix R2 is inverted. The solution of the matrix equation -* is Q3^* * V3 = the product of the Jacobi rotations (appplied to +* is Q3^* * V3 = the product of the Jacobi rotations (applied to * the lower triangular L3 from the LQ factorization of * R2=L3*Q3), pre-multiplied with the transposed Q3. CALL CGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U, diff --git a/lapack-netlib/SRC/cgelq.f b/lapack-netlib/SRC/cgelq.f index d66033166..24aaa982e 100644 --- a/lapack-netlib/SRC/cgelq.f +++ b/lapack-netlib/SRC/cgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -166,6 +166,8 @@ *> the LQ factorization. *> \endverbatim *> +*> \ingroup gelq +*> * ===================================================================== SUBROUTINE CGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -190,7 +192,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGELQT, CLASWLQ, XERBLA @@ -292,9 +295,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = LWREQ + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -319,7 +322,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = LWREQ + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) * RETURN * diff --git a/lapack-netlib/SRC/cgelqf.f b/lapack-netlib/SRC/cgelqf.f index 37ef13a27..3847a958a 100644 --- a/lapack-netlib/SRC/cgelqf.f +++ b/lapack-netlib/SRC/cgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -118,7 +119,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup gelqf * *> \par Further Details: * ===================== @@ -167,16 +168,16 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'CGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -184,19 +185,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -266,7 +273,7 @@ $ CALL CGELQ2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGELQF diff --git a/lapack-netlib/SRC/cgelqt3.f b/lapack-netlib/SRC/cgelqt3.f index 1dfbd3f2b..553087bf4 100644 --- a/lapack-netlib/SRC/cgelqt3.f +++ b/lapack-netlib/SRC/cgelqt3.f @@ -159,7 +159,8 @@ * * Compute Householder transform when M=1 * - CALL CLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T ) + CALL CLARFG( N, A( 1, 1 ), A( 1, MIN( 2, N ) ), LDA, + & T( 1, 1 ) ) T(1,1)=CONJG(T(1,1)) * ELSE diff --git a/lapack-netlib/SRC/cgelsd.f b/lapack-netlib/SRC/cgelsd.f index fce4ca6e2..5d7eec68d 100644 --- a/lapack-netlib/SRC/cgelsd.f +++ b/lapack-netlib/SRC/cgelsd.f @@ -60,12 +60,6 @@ *> singular values which are less than RCOND times the largest singular *> value. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -210,7 +204,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEsolve +*> \ingroup gelsd * *> \par Contributors: * ================== @@ -255,13 +249,13 @@ * .. External Subroutines .. EXTERNAL CGEBRD, CGELQF, CGEQRF, CLACPY, $ CLALSD, CLASCL, CLASET, CUNMBR, - $ CUNMLQ, CUNMQR, SLABAD, SLASCL, + $ CUNMLQ, CUNMQR, SLASCL, $ SLASET, XERBLA * .. * .. External Functions .. INTEGER ILAENV - REAL CLANGE, SLAMCH - EXTERNAL CLANGE, SLAMCH, ILAENV + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL CLANGE, SLAMCH, ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, LOG, MAX, MIN, REAL @@ -373,7 +367,7 @@ END IF END IF MINWRK = MIN( MINWRK, MAXWRK ) - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) IWORK( 1 ) = LIWORK RWORK( 1 ) = LRWORK * @@ -402,7 +396,6 @@ SFMIN = SLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A if max entry outside range [SMLNUM,BIGNUM]. * @@ -653,7 +646,7 @@ END IF * 10 CONTINUE - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) IWORK( 1 ) = LIWORK RWORK( 1 ) = LRWORK RETURN diff --git a/lapack-netlib/SRC/cgelss.f b/lapack-netlib/SRC/cgelss.f index da6b9092f..00d7f596a 100644 --- a/lapack-netlib/SRC/cgelss.f +++ b/lapack-netlib/SRC/cgelss.f @@ -170,7 +170,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEsolve +*> \ingroup gelss * * ===================================================================== SUBROUTINE CGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, @@ -214,13 +214,12 @@ * .. External Subroutines .. EXTERNAL CBDSQR, CCOPY, CGEBRD, CGELQF, CGEMM, CGEMV, $ CGEQRF, CLACPY, CLASCL, CLASET, CSRSCL, CUNGBR, - $ CUNMBR, CUNMLQ, CUNMQR, SLABAD, SLASCL, SLASET, - $ XERBLA + $ CUNMBR, CUNMLQ, CUNMQR, SLASCL, SLASET, XERBLA * .. * .. External Functions .. INTEGER ILAENV - REAL CLANGE, SLAMCH - EXTERNAL ILAENV, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -362,7 +361,7 @@ END IF MAXWRK = MAX( MINWRK, MAXWRK ) END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) $ INFO = -12 @@ -388,7 +387,6 @@ SFMIN = SLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A if max element outside range [SMLNUM,BIGNUM] * @@ -540,7 +538,7 @@ $ LDB, CZERO, WORK, N ) CALL CLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB ) 20 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL CGEMV( 'C', N, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 ) CALL CCOPY( N, WORK, 1, B, 1 ) END IF @@ -645,7 +643,7 @@ CALL CLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ), $ LDB ) 40 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL CGEMV( 'C', M, M, CONE, WORK( IL ), LDWORK, B( 1, 1 ), $ 1, CZERO, WORK( IWORK ), 1 ) CALL CCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 ) @@ -737,7 +735,7 @@ $ LDB, CZERO, WORK, N ) CALL CLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB ) 60 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL CGEMV( 'C', M, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 ) CALL CCOPY( N, WORK, 1, B, 1 ) END IF @@ -760,7 +758,7 @@ CALL CLASCL( 'G', 0, 0, BIGNUM, BNRM, N, NRHS, B, LDB, INFO ) END IF 70 CONTINUE - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of CGELSS diff --git a/lapack-netlib/SRC/cgelst.f b/lapack-netlib/SRC/cgelst.f index 7d8e44ddf..b69626934 100644 --- a/lapack-netlib/SRC/cgelst.f +++ b/lapack-netlib/SRC/cgelst.f @@ -176,7 +176,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEsolve +*> \ingroup gelst * *> \par Contributors: * ================== @@ -224,15 +224,15 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, CLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, CLANGE + REAL SLAMCH, CLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, CLANGE, SROUNDUP_LWORK * .. * .. External Subroutines .. - EXTERNAL CGELQT, CGEQRT, CGEMLQT, CGEMQRT, SLABAD, + EXTERNAL CGELQT, CGEQRT, CGEMLQT, CGEMQRT, $ CLASCL, CLASET, CTRTRS, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC REAL, MAX, MIN + INTRINSIC MAX, MIN * .. * .. Executable Statements .. * @@ -270,7 +270,7 @@ * MNNRHS = MAX( MN, NRHS ) LWOPT = MAX( 1, (MN+MNNRHS)*NB ) - WORK( 1 ) = REAL( LWOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWOPT ) * END IF * @@ -285,7 +285,7 @@ * IF( MIN( M, N, NRHS ).EQ.0 ) THEN CALL CLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB ) - WORK( 1 ) = REAL( LWOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWOPT ) RETURN END IF * @@ -311,7 +311,6 @@ * SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * @@ -334,7 +333,7 @@ * Matrix all zero. Return zero solution. * CALL CLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB ) - WORK( 1 ) = REAL( LWOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWOPT ) RETURN END IF * @@ -524,7 +523,7 @@ $ INFO ) END IF * - WORK( 1 ) = REAL( LWOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cgelsy.f b/lapack-netlib/SRC/cgelsy.f index 67140f191..64bb10023 100644 --- a/lapack-netlib/SRC/cgelsy.f +++ b/lapack-netlib/SRC/cgelsy.f @@ -116,6 +116,7 @@ *> B is COMPLEX array, dimension (LDB,NRHS) *> On entry, the M-by-NRHS right hand side matrix B. *> On exit, the N-by-NRHS solution matrix X. +*> If M = 0 or N = 0, B is not referenced. *> \endverbatim *> *> \param[in] LDB @@ -148,6 +149,7 @@ *> The effective rank of A, i.e., the order of the submatrix *> R11. This is the same as the order of the submatrix T11 *> in the complete orthogonal factorization of A. +*> If NRHS = 0, RANK = 0 on output. *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/cgemlq.f b/lapack-netlib/SRC/cgemlq.f index e0cf78bc0..e5b02b669 100644 --- a/lapack-netlib/SRC/cgemlq.f +++ b/lapack-netlib/SRC/cgemlq.f @@ -110,16 +110,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -143,7 +144,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -159,11 +160,13 @@ *> block sizes MB and NB returned by ILAENV, CGELQ will use either *> CLASWLQ (if the matrix is wide-and-short) or CGELQT to compute *> the LQ factorization. -*> This version of CGEMLQ will use either CLAMSWLQ or CGEMLQT to +*> This version of CGEMLQ will use either CLAMSWLQ or CGEMLQT to *> multiply matrix Q by another matrix. *> Further Details in CLAMSWLQ or CGEMLQT. *> \endverbatim *> +*> \ingroup gemlq +*> * ===================================================================== SUBROUTINE CGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -185,11 +188,12 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLAMSWLQ, CGEMLQT, XERBLA @@ -201,7 +205,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -216,6 +220,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -244,12 +255,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( ( LWORK.LT.LWMIN ) .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = REAL( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -261,7 +272,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -274,7 +285,7 @@ $ MB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = REAL( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/cgemqr.f b/lapack-netlib/SRC/cgemqr.f index ea9de146e..0b7dd9dd7 100644 --- a/lapack-netlib/SRC/cgemqr.f +++ b/lapack-netlib/SRC/cgemqr.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -166,6 +167,8 @@ *> *> \endverbatim *> +*> \ingroup gemqr +*> * ===================================================================== SUBROUTINE CGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -187,11 +190,12 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGEMQRT, CLAMTSQR, XERBLA @@ -203,7 +207,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -218,6 +222,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -251,7 +262,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -263,7 +274,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -276,7 +287,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/cgeqlf.f b/lapack-netlib/SRC/cgeqlf.f index d2c11c269..6c67344c5 100644 --- a/lapack-netlib/SRC/cgeqlf.f +++ b/lapack-netlib/SRC/cgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -113,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup geqlf * *> \par Further Details: * ===================== @@ -162,7 +163,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -186,10 +188,11 @@ NB = ILAENV( 1, 'CGEQLF', ' ', M, N, -1, -1 ) LWKOPT = N*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * @@ -276,7 +279,7 @@ IF( MU.GT.0 .AND. NU.GT.0 ) $ CALL CGEQL2( MU, NU, A, LDA, TAU, WORK, IINFO ) * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGEQLF diff --git a/lapack-netlib/SRC/cgeqp3rk.c b/lapack-netlib/SRC/cgeqp3rk.c new file mode 100644 index 000000000..54e7fb140 --- /dev/null +++ b/lapack-netlib/SRC/cgeqp3rk.c @@ -0,0 +1,1071 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + +/* Check if the matrix A contains +Inf or -Inf, set INFO parameter */ +/* to the column number, where the first +/-Inf is found plus N, */ +/* and continue the computation. */ + + *info = *n + kp1; + + } + +/* ================================================================== */ + +/* Quick return if possible for the case when the first */ +/* stopping criterion is satisfied, i.e. KMAX = 0. */ + + if (*kmax == 0) { + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.f; + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + q__1.r = (real) lwkopt, q__1.i = 0.f; + work[1].r = q__1.r, work[1].i = q__1.i; + return 0; + } + +/* ================================================================== */ + + eps = slamch_("Epsilon"); + +/* Adjust ABSTOL */ + + if (*abstol >= 0.f) { + safmin = slamch_("Safe minimum"); +/* Computing MAX */ + r__1 = *abstol, r__2 = safmin * 2.f; + *abstol = f2cmax(r__1,r__2); + } + +/* Adjust RELTOL */ + + if (*reltol >= 0.f) { + *reltol = f2cmax(*reltol,eps); + } + +/* =================================================================== */ + +/* JMAX is the maximum index of the column to be factorized, */ +/* which is also limited by the first stopping criterion KMAX. */ + + jmax = f2cmin(*kmax,minmn); + +/* =================================================================== */ + +/* Quick return if possible for the case when the second or third */ +/* stopping criterion for the whole original matrix is satified, */ +/* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */ +/* (which is ONE <= RELTOL). */ + + if (maxc2nrm <= *abstol || 1.f <= *reltol) { + + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.f; + + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + + q__1.r = (real) lwkopt, q__1.i = 0.f; + work[1].r = q__1.r, work[1].i = q__1.i; + return 0; + } + +/* ================================================================== */ +/* Factorize columns */ +/* ================================================================== */ + +/* Determine the block size. */ + + nbmin = 2; + nx = 0; + + if (nb > 1 && nb < minmn) { + +/* Determine when to cross over from blocked to unblocked code. */ +/* (for N less than NX, unblocked code should be used). */ + +/* Computing MAX */ + i__1 = 0, i__2 = ilaenv_(&c__3, "CGEQP3RK", " ", m, n, &c_n1, &c_n1, ( + ftnlen)8, (ftnlen)1); + nx = f2cmax(i__1,i__2); + + if (nx < minmn) { + +/* Determine if workspace is large enough for blocked code. */ + + if (*lwork < lwkopt) { + +/* Not enough workspace to use optimal block size that */ +/* is currently stored in NB. */ +/* Reduce NB and determine the minimum value of NB. */ + + nb = (*lwork - (*n << 1)) / (*n + 1); +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "CGEQP3RK", " ", m, n, &c_n1, + &c_n1, (ftnlen)8, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + } + } + } + +/* ================================================================== */ + +/* DONE is the boolean flag to rerpresent the case when the */ +/* factorization completed in the block factorization routine, */ +/* before the end of the block. */ + + done = FALSE_; + +/* J is the column index. */ + + j = 1; + +/* (1) Use blocked code initially. */ + +/* JMAXB is the maximum column index of the block, when the */ +/* blocked code is used, is also limited by the first stopping */ +/* criterion KMAX. */ + +/* Computing MIN */ + i__1 = *kmax, i__2 = minmn - nx; + jmaxb = f2cmin(i__1,i__2); + + if (nb >= nbmin && nb < jmax && jmaxb > 0) { + +/* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */ +/* J is the column index of a column block; */ +/* JB is the column block size to pass to block factorization */ +/* routine in a loop step; */ +/* JBF is the number of columns that were actually factorized */ +/* that was returned by the block factorization routine */ +/* in a loop step, JBF <= JB; */ +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + while(j <= jmaxb) { + +/* Computing MIN */ + i__1 = nb, i__2 = jmaxb - j + 1; + jb = f2cmin(i__1,i__2); + n_sub__ = *n - j + 1; + ioffset = j - 1; + +/* Factorize JB columns among the columns A(J:N). */ + + i__1 = *n + *nrhs - j + 1; + claqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, + &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, + maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], & + rwork[*n + j], &work[1], &work[jb + 1], &i__1, &iwork[1], + &iinfo); + +/* Set INFO on the first occurence of Inf. */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } + + if (done) { + +/* Either the submatrix is zero before the end of the */ +/* column block, or ABSTOL or RELTOL criterion is */ +/* satisfied before the end of the column block, we can */ +/* return from the routine. Perform the following before */ +/* returning: */ +/* a) Set the number of factorized columns K, */ +/* K = IOFFSET + JBF from the last call of blocked */ +/* routine. */ +/* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */ +/* by the block factorization routine; */ +/* 2) The remaining TAUs are set to ZERO by the */ +/* block factorization routine. */ + + *k = ioffset + jbf; + +/* Set INFO on the first occurrence of NaN, NaN takes */ +/* prcedence over Inf. */ + + if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + +/* Return from the routine. */ + + q__1.r = (real) lwkopt, q__1.i = 0.f; + work[1].r = q__1.r, work[1].i = q__1.i; + + return 0; + + } + + j += jbf; + + } + + } + +/* Use unblocked code to factor the last or only block. */ +/* J = JMAX+1 means we factorized the maximum possible number of */ +/* columns, that is in ELSE clause we need to compute */ +/* the MAXC2NORM and RELMAXC2NORM to return after we processed */ +/* the blocks. */ + + if (j <= jmax) { + +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + n_sub__ = *n - j + 1; + ioffset = j - 1; + + i__1 = jmax - j + 1; + claqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, & + maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, + relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], &rwork[*n + j], & + work[1], &iinfo); + +/* ABSTOL or RELTOL criterion is satisfied when the number of */ +/* the factorized columns KF is smaller then the number */ +/* of columns JMAX-J+1 supplied to be factorized by the */ +/* unblocked routine, we can return from */ +/* the routine. Perform the following before returning: */ +/* a) Set the number of factorized columns K, */ +/* b) MAXC2NRMK and RELMAXC2NRMK are returned by the */ +/* unblocked factorization routine above. */ + + *k = j - 1 + kf; + +/* Set INFO on the first exception occurence. */ + +/* Set INFO on the first exception occurence of Inf or NaN, */ +/* (NaN takes precedence over Inf). */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } else if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + + } else { + +/* Compute the return values for blocked code. */ + +/* Set the number of factorized columns if the unblocked routine */ +/* was not called. */ + + *k = jmax; + +/* If there exits a residual matrix after the blocked code: */ +/* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */ +/* residual matrix, otherwise set them to ZERO; */ +/* 2) Set TAU(K+1:MINMN) to ZERO. */ + + if (*k < minmn) { + i__1 = *n - *k; + jmaxc2nrm = *k + isamax_(&i__1, &rwork[*k + 1], &c__1); + *maxc2nrmk = rwork[jmaxc2nrm]; + if (*k == 0) { + *relmaxc2nrmk = 1.f; + } else { + *relmaxc2nrmk = *maxc2nrmk / maxc2nrm; + } + + i__1 = minmn; + for (j = *k + 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + + } else { + *maxc2nrmk = 0.f; + *relmaxc2nrmk = 0.f; + + } + +/* END IF( J.LE.JMAX ) THEN */ + + } + + q__1.r = (real) lwkopt, q__1.i = 0.f; + work[1].r = q__1.r, work[1].i = q__1.i; + + return 0; + +/* End of CGEQP3RK */ + +} /* cgeqp3rk_ */ + diff --git a/lapack-netlib/SRC/cgeqp3rk.f b/lapack-netlib/SRC/cgeqp3rk.f new file mode 100644 index 000000000..731c44edb --- /dev/null +++ b/lapack-netlib/SRC/cgeqp3rk.f @@ -0,0 +1,1093 @@ +*> \brief \b CGEQP3RK computes a truncated Householder QR factorization with column pivoting of a complex m-by-n matrix A by using Level 3 BLAS and overwrites m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CGEQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, +* $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ WORK, LWORK, RWORK, IWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, K, KMAX, LDA, LWORK, M, N, NRHS +* REAL ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* REAL RWORK( * ) +* COMPLEX A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CGEQP3RK performs two tasks simultaneously: +*> +*> Task 1: The routine computes a truncated (rank K) or full rank +*> Householder QR factorization with column pivoting of a complex +*> M-by-N matrix A using Level 3 BLAS. K is the number of columns +*> that were factorized, i.e. factorization rank of the +*> factor R, K <= min(M,N). +*> +*> A * P(K) = Q(K) * R(K) = +*> +*> = Q(K) * ( R11(K) R12(K) ) = Q(K) * ( R(K)_approx ) +*> ( 0 R22(K) ) ( 0 R(K)_residual ), +*> +*> where: +*> +*> P(K) is an N-by-N permutation matrix; +*> Q(K) is an M-by-M unitary matrix; +*> R(K)_approx = ( R11(K), R12(K) ) is a rank K approximation of the +*> full rank factor R with K-by-K upper-triangular +*> R11(K) and K-by-N rectangular R12(K). The diagonal +*> entries of R11(K) appear in non-increasing order +*> of absolute value, and absolute values of all of +*> them exceed the maximum column 2-norm of R22(K) +*> up to roundoff error. +*> R(K)_residual = R22(K) is the residual of a rank K approximation +*> of the full rank factor R. It is a +*> an (M-K)-by-(N-K) rectangular matrix; +*> 0 is a an (M-K)-by-K zero matrix. +*> +*> Task 2: At the same time, the routine overwrites a complex M-by-NRHS +*> matrix B with Q(K)**H * B using Level 3 BLAS. +*> +*> ===================================================================== +*> +*> The matrices A and B are stored on input in the array A as +*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS) +*> respectively. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> The truncation criteria (i.e. when to stop the factorization) +*> can be any of the following: +*> +*> 1) The input parameter KMAX, the maximum number of columns +*> KMAX to factorize, i.e. the factorization rank is limited +*> to KMAX. If KMAX >= min(M,N), the criterion is not used. +*> +*> 2) The input parameter ABSTOL, the absolute tolerance for +*> the maximum column 2-norm of the residual matrix R22(K). This +*> means that the factorization stops if this norm is less or +*> equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used. +*> +*> 3) The input parameter RELTOL, the tolerance for the maximum +*> column 2-norm matrix of the residual matrix R22(K) divided +*> by the maximum column 2-norm of the original matrix A, which +*> is equal to abs(R(1,1)). This means that the factorization stops +*> when the ratio of the maximum column 2-norm of R22(K) to +*> the maximum column 2-norm of A is less than or equal to RELTOL. +*> If RELTOL < 0.0, the criterion is not used. +*> +*> 4) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix R22(K) is a zero matrix in some +*> factorization step K. ( This stopping criterion is implicit. ) +*> +*> The algorithm stops when any of these conditions is first +*> satisfied, otherwise the whole matrix A is factorized. +*> +*> To factorize the whole matrix A, use the values +*> KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0. +*> +*> The routine returns: +*> a) Q(K), R(K)_approx = ( R11(K), R12(K) ), +*> R(K)_residual = R22(K), P(K), i.e. the resulting matrices +*> of the factorization; P(K) is represented by JPIV, +*> ( if K = min(M,N), R(K)_approx is the full factor R, +*> and there is no residual matrix R(K)_residual); +*> b) K, the number of columns that were factorized, +*> i.e. factorization rank; +*> c) MAXC2NRMK, the maximum column 2-norm of the residual +*> matrix R(K)_residual = R22(K), +*> ( if K = min(M,N), MAXC2NRMK = 0.0 ); +*> d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum +*> column 2-norm of the original matrix A, which is equal +*> to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 ); +*> e) Q(K)**H * B, the matrix B with the unitary +*> transformation Q(K)**H applied on the left. +*> +*> The N-by-N permutation matrix P(K) is stored in a compact form in +*> the integer array JPIV. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The M-by-M unitary matrix Q is represented as a product +*> of elementary Householder reflectors +*> +*> Q(K) = H(1) * H(2) * . . . * H(K), +*> +*> where K is the number of columns that were factorized. +*> +*> Each H(j) has the form +*> +*> H(j) = I - tau * v * v**H, +*> +*> where 1 <= j <= K and +*> I is an M-by-M identity matrix, +*> tau is a complex scalar, +*> v is a complex vector with v(1:j-1) = 0 and v(j) = 1. +*> +*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j). +*> +*> See the Further Details section for more information. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e. the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M,N), then this stopping criterion +*> is not used, the routine factorizes columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B are not modified, and +*> the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL +*> +*> The second factorization stopping criterion, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix R22(K). +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix R22(K) +*> is less than or equal to ABSTOL. Let SAFMIN = DLAMCH('S'). +*> +*> a) If ABSTOL is NaN, then no computation is performed +*> and an error message ( INFO = -5 ) is issued +*> by XERBLA. +*> +*> b) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN +*> is used. This includes the case ABSTOL = -0.0. +*> +*> d) If 2*SAFMIN <= ABSTOL then the input value +*> of ABSTOL is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If ABSTOL chosen above is >= MAXC2NRM, then this +*> stopping criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. The routine +*> returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case ABSTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is REAL +*> +*> The third factorization stopping criterion, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio +*> abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of +*> the residual matrix R22(K) to the maximum column 2-norm of +*> the original matrix A. The algorithm converges (stops the +*> factorization), when abs(R(K+1,K+1))/abs(R(1,1)) A is less +*> than or equal to RELTOL. Let EPS = DLAMCH('E'). +*> +*> a) If RELTOL is NaN, then no computation is performed +*> and an error message ( INFO = -6 ) is issued +*> by XERBLA. +*> +*> b) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used. +*> This includes the case RELTOL = -0.0. +*> +*> d) If EPS <= RELTOL then the input value of RELTOL +*> is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If RELTOL chosen above is >= 1.0, then this stopping +*> criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. +*> The routine returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case RELTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> +*> NOTE: We recommend that RELTOL satisfy +*> min( 10*max(M,N)*EPS, sqrt(EPS) ) <= RELTOL +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N+NRHS) +*> +*> On entry: +*> +*> a) The subarray A(1:M,1:N) contains the M-by-N matrix A. +*> b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS +*> matrix B. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> +*> a) The subarray A(1:M,1:N) contains parts of the factors +*> of the matrix A: +*> +*> 1) If K = 0, A(1:M,1:N) contains the original matrix A. +*> 2) If K > 0, A(1:M,1:N) contains parts of the +*> factors: +*> +*> 1. The elements below the diagonal of the subarray +*> A(1:M,1:K) together with TAU(1:K) represent the +*> unitary matrix Q(K) as a product of K Householder +*> elementary reflectors. +*> +*> 2. The elements on and above the diagonal of +*> the subarray A(1:K,1:N) contain K-by-N +*> upper-trapezoidal matrix +*> R(K)_approx = ( R11(K), R12(K) ). +*> NOTE: If K=min(M,N), i.e. full rank factorization, +*> then R_approx(K) is the full factor R which +*> is upper-trapezoidal. If, in addition, M>=N, +*> then R is upper-triangular. +*> +*> 3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K) +*> rectangular matrix R(K)_residual = R22(K). +*> +*> b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains +*> the M-by-NRHS product Q(K)**H * B. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> This is the leading dimension for both matrices, A and B. +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> +*> NOTE: If K = 0, a) the arrays A and B are not modified; +*> b) the array TAU(1:min(M,N)) is set to ZERO, +*> if the matrix A does not contain NaN, +*> otherwise the elements TAU(1:min(M,N)) +*> are undefined; +*> c) the elements of the array JPIV are set +*> as follows: for j = 1:N, JPIV(j) = j. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is REAL +*> The maximum column 2-norm of the residual matrix R22(K), +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then MAXC2NRMK equals the maximum column 2-norm +*> of the original matrix A. +*> +*> b) If 0 < K < min(M,N), then MAXC2NRMK is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then MAXC2NRMK = 0.0. +*> +*> NOTE: MAXC2NRMK in the factorization step K would equal +*> R(K+1,K+1) in the next factorization step K+1. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is REAL +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix R22(K) (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then RELMAXC2NRMK = 1.0. +*> +*> b) If 0 < K < min(M,N), then +*> RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then RELMAXC2NRMK = 0.0. +*> +*> NOTE: RELMAXC2NRMK in the factorization step K would equal +*> abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization +*> step K+1. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The elements of the array JPIV(1:N) are always set +*> by the routine, for example, even when no columns +*> were factorized, i.e. when K = 0, the elements are +*> set as JPIV(j) = j for j = 1:N. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX array, dimension (min(M,N)) +*> The scalar factors of the elementary reflectors. +*> +*> If 0 < K <= min(M,N), only the elements TAU(1:K) of +*> the array TAU are modified by the factorization. +*> After the factorization computed, if no NaN was found +*> during the factorization, the remaining elements +*> TAU(K+1:min(M,N)) are set to zero, otherwise the +*> elements TAU(K+1:min(M,N)) are not set and therefore +*> undefined. +*> ( If K = 0, all elements of TAU are set to zero, if +*> the matrix A does not contain NaN. ) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= N+NRHS-1, otherwise. +*> For optimal performance LWORK >= NB*( N+NRHS+1 ), +*> where NB is the optimal block size for CGEQP3RK returned +*> by ILAENV. Minimal block size MINNB=2. +*> +*> NOTE: The decision, whether to use unblocked BLAS 2 +*> or blocked BLAS 3 code is based not only on the dimension +*> LWORK of the availbale workspace WORK, but also also on the +*> matrix A dimension N via crossover point NX returned +*> by ILAENV. (For N less than NX, unblocked code should be +*> used.) +*> +*> If LWORK = -1, then a workspace query is assumed; +*> the routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (2*N) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix in the blocked step auxiliary subroutine CLAQP3RK ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) INFO < 0: if INFO = -i, the i-th argument had an +*> illegal value. +*> 3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup geqp3rk +* +*> \par Further Details: +* ===================== +* +*> \verbatim +*> CGEQP3RK is based on the same BLAS3 Householder QR factorization +*> algorithm with column pivoting as in CGEQP3 routine which uses +*> CLARFG routine to generate Householder reflectors +*> for QR factorization. +*> +*> We can also write: +*> +*> A = A_approx(K) + A_residual(K) +*> +*> The low rank approximation matrix A(K)_approx from +*> the truncated QR factorization of rank K of the matrix A is: +*> +*> A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T +*> ( 0 0 ) +*> +*> = Q(K) * ( R11(K) R12(K) ) * P(K)**T +*> ( 0 0 ) +*> +*> The residual A_residual(K) of the matrix A is: +*> +*> A_residual(K) = Q(K) * ( 0 0 ) * P(K)**T = +*> ( 0 R(K)_residual ) +*> +*> = Q(K) * ( 0 0 ) * P(K)**T +*> ( 0 R22(K) ) +*> +*> The truncated (rank K) factorization guarantees that +*> the maximum column 2-norm of A_residual(K) is less than +*> or equal to MAXC2NRMK up to roundoff error. +*> +*> NOTE: An approximation of the null vectors +*> of A can be easily computed from R11(K) +*> and R12(K): +*> +*> Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) ) +*> ( -I ) +*> +*> \endverbatim +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, + $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ WORK, LWORK, RWORK, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, K, KF, KMAX, LDA, LWORK, M, N, NRHS + REAL ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + REAL RWORK( * ) + COMPLEX A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER INB, INBMIN, IXOVER + PARAMETER ( INB = 1, INBMIN = 2, IXOVER = 3 ) + REAL ZERO, ONE, TWO + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0 ) + COMPLEX CZERO + PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, DONE + INTEGER IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX, + $ JMAXC2NRM, KP1, LWKOPT, MINMN, N_SUB, NB, + $ NBMIN, NX + REAL EPS, HUGEVAL, MAXC2NRM, SAFMIN +* .. +* .. External Subroutines .. + EXTERNAL CLAQP2RK, CLAQP3RK, XERBLA +* .. +* .. External Functions .. + LOGICAL SISNAN + INTEGER ISAMAX, ILAENV + REAL SLAMCH, SCNRM2, SROUNDUP_LWORK + EXTERNAL SISNAN, SLAMCH, SCNRM2, ISAMAX, ILAENV, + $ SROUNDUP_LWORK +* .. +* .. Intrinsic Functions .. + INTRINSIC CMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test input arguments +* ==================== +* + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( KMAX.LT.0 ) THEN + INFO = -4 + ELSE IF( SISNAN( ABSTOL ) ) THEN + INFO = -5 + ELSE IF( SISNAN( RELTOL ) ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -8 + END IF +* +* If the input parameters M, N, NRHS, KMAX, LDA are valid: +* a) Test the input workspace size LWORK for the minimum +* size requirement IWS. +* b) Determine the optimal block size NB and optimal +* workspace size LWKOPT to be returned in WORK(1) +* in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., +* (3) when routine exits. +* Here, IWS is the miminum workspace required for unblocked +* code. +* + IF( INFO.EQ.0 ) THEN + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + IWS = 1 + LWKOPT = 1 + ELSE +* +* Minimal workspace size in case of using only unblocked +* BLAS 2 code in CLAQP2RK. +* 1) CLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in CLARF subroutine inside CLAQP2RK to apply an +* elementary reflector from the left. +* TOTAL_WORK_SIZE = 3*N + NRHS - 1 +* + IWS = N + NRHS - 1 +* +* Assign to NB optimal block size. +* + NB = ILAENV( INB, 'CGEQP3RK', ' ', M, N, -1, -1 ) +* +* A formula for the optimal workspace size in case of using +* both unblocked BLAS 2 in CLAQP2RK and blocked BLAS 3 code +* in CLAQP3RK. +* 1) CGEQP3RK, CLAQP2RK, CLAQP3RK: 2*N to store full and +* partial column 2-norms. +* 2) CLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in CLARF subroutine to apply an elementary reflector +* from the left. +* 3) CLAQP3RK: NB*(N+NRHS) to use in the work array F that +* is used to apply a block reflector from +* the left. +* 4) CLAQP3RK: NB to use in the auxilixary array AUX. +* Sizes (2) and ((3) + (4)) should intersect, therefore +* TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2. +* + LWKOPT = 2*N + NB*( N+NRHS+1 ) + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* + IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN + INFO = -15 + END IF + END IF +* +* NOTE: The optimal workspace size is returned in WORK(1), if +* the input parameters M, N, NRHS, KMAX, LDA are valid. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGEQP3RK', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible for M=0 or N=0. +* + IF( MINMN.EQ.0 ) THEN + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* +* Initialize column pivot array JPIV. +* + DO J = 1, N + JPIV( J ) = J + END DO +* +* ================================================================== +* +* Initialize storage for partial and exact column 2-norms. +* a) The elements WORK(1:N) are used to store partial column +* 2-norms of the matrix A, and may decrease in each computation +* step; initialize to the values of complete columns 2-norms. +* b) The elements WORK(N+1:2*N) are used to store complete column +* 2-norms of the matrix A, they are not changed during the +* computation; initialize the values of complete columns 2-norms. +* + DO J = 1, N + RWORK( J ) = SCNRM2( M, A( 1, J ), 1 ) + RWORK( N+J ) = RWORK( J ) + END DO +* +* ================================================================== +* +* Compute the pivot column index and the maximum column 2-norm +* for the whole original matrix stored in A(1:M,1:N). +* + KP1 = ISAMAX( N, RWORK( 1 ), 1 ) +* +* ==================================================================. +* + IF( SISNAN( MAXC2NRM ) ) THEN +* +* Check if the matrix A contains NaN, set INFO parameter +* to the column number where the first NaN is found and return +* from the routine. +* + K = 0 + INFO = KP1 +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = MAXC2NRM +* +* Array TAU is not set and contains undefined elements. +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RETURN + END IF +* +* =================================================================== +* + IF( MAXC2NRM.EQ.ZERO ) THEN +* +* Check is the matrix A is a zero matrix, set array TAU and +* return from the routine. +* + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO +* + DO J = 1, MINMN + TAU( J ) = CZERO + END DO +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RETURN +* + END IF +* +* =================================================================== +* + HUGEVAL = SLAMCH( 'Overflow' ) +* + IF( MAXC2NRM.GT.HUGEVAL ) THEN +* +* Check if the matrix A contains +Inf or -Inf, set INFO parameter +* to the column number, where the first +/-Inf is found plus N, +* and continue the computation. +* + INFO = N + KP1 +* + END IF +* +* ================================================================== +* +* Quick return if possible for the case when the first +* stopping criterion is satisfied, i.e. KMAX = 0. +* + IF( KMAX.EQ.0 ) THEN + K = 0 + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = ONE + DO J = 1, MINMN + TAU( J ) = CZERO + END DO + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* + EPS = SLAMCH('Epsilon') +* +* Adjust ABSTOL +* + IF( ABSTOL.GE.ZERO ) THEN + SAFMIN = SLAMCH('Safe minimum') + ABSTOL = MAX( ABSTOL, TWO*SAFMIN ) + END IF +* +* Adjust RELTOL +* + IF( RELTOL.GE.ZERO ) THEN + RELTOL = MAX( RELTOL, EPS ) + END IF +* +* =================================================================== +* +* JMAX is the maximum index of the column to be factorized, +* which is also limited by the first stopping criterion KMAX. +* + JMAX = MIN( KMAX, MINMN ) +* +* =================================================================== +* +* Quick return if possible for the case when the second or third +* stopping criterion for the whole original matrix is satified, +* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL +* (which is ONE <= RELTOL). +* + IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN +* + K = 0 + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = ONE +* + DO J = 1, MINMN + TAU( J ) = CZERO + END DO +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* Factorize columns +* ================================================================== +* +* Determine the block size. +* + NBMIN = 2 + NX = 0 +* + IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN +* +* Determine when to cross over from blocked to unblocked code. +* (for N less than NX, unblocked code should be used). +* + NX = MAX( 0, ILAENV( IXOVER, 'CGEQP3RK', ' ', M, N, -1, -1 ) ) +* + IF( NX.LT.MINMN ) THEN +* +* Determine if workspace is large enough for blocked code. +* + IF( LWORK.LT.LWKOPT ) THEN +* +* Not enough workspace to use optimal block size that +* is currently stored in NB. +* Reduce NB and determine the minimum value of NB. +* + NB = ( LWORK-2*N ) / ( N+1 ) + NBMIN = MAX( 2, ILAENV( INBMIN, 'CGEQP3RK', ' ', M, N, + $ -1, -1 ) ) +* + END IF + END IF + END IF +* +* ================================================================== +* +* DONE is the boolean flag to rerpresent the case when the +* factorization completed in the block factorization routine, +* before the end of the block. +* + DONE = .FALSE. +* +* J is the column index. +* + J = 1 +* +* (1) Use blocked code initially. +* +* JMAXB is the maximum column index of the block, when the +* blocked code is used, is also limited by the first stopping +* criterion KMAX. +* + JMAXB = MIN( KMAX, MINMN - NX ) +* + IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN +* +* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: +* J is the column index of a column block; +* JB is the column block size to pass to block factorization +* routine in a loop step; +* JBF is the number of columns that were actually factorized +* that was returned by the block factorization routine +* in a loop step, JBF <= JB; +* N_SUB is the number of columns in the submatrix; +* IOFFSET is the number of rows that should not be factorized. +* + DO WHILE( J.LE.JMAXB ) +* + JB = MIN( NB, JMAXB-J+1 ) + N_SUB = N-J+1 + IOFFSET = J-1 +* +* Factorize JB columns among the columns A(J:N). +* + CALL CLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA, + $ DONE, JBF, MAXC2NRMK, RELMAXC2NRMK, + $ JPIV( J ), TAU( J ), + $ RWORK( J ), RWORK( N+J ), + $ WORK( 1 ), WORK( JB+1 ), + $ N+NRHS-J+1, IWORK, IINFO ) +* +* Set INFO on the first occurence of Inf. +* + IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN + INFO = 2*IOFFSET + IINFO + END IF +* + IF( DONE ) THEN +* +* Either the submatrix is zero before the end of the +* column block, or ABSTOL or RELTOL criterion is +* satisfied before the end of the column block, we can +* return from the routine. Perform the following before +* returning: +* a) Set the number of factorized columns K, +* K = IOFFSET + JBF from the last call of blocked +* routine. +* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned +* by the block factorization routine; +* 2) The remaining TAUs are set to ZERO by the +* block factorization routine. +* + K = IOFFSET + JBF +* +* Set INFO on the first occurrence of NaN, NaN takes +* prcedence over Inf. +* + IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN + INFO = IOFFSET + IINFO + END IF +* +* Return from the routine. +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* + RETURN +* + END IF +* + J = J + JBF +* + END DO +* + END IF +* +* Use unblocked code to factor the last or only block. +* J = JMAX+1 means we factorized the maximum possible number of +* columns, that is in ELSE clause we need to compute +* the MAXC2NORM and RELMAXC2NORM to return after we processed +* the blocks. +* + IF( J.LE.JMAX ) THEN +* +* N_SUB is the number of columns in the submatrix; +* IOFFSET is the number of rows that should not be factorized. +* + N_SUB = N-J+1 + IOFFSET = J-1 +* + CALL CLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1, + $ ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA, + $ KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ), + $ TAU( J ), RWORK( J ), RWORK( N+J ), + $ WORK( 1 ), IINFO ) +* +* ABSTOL or RELTOL criterion is satisfied when the number of +* the factorized columns KF is smaller then the number +* of columns JMAX-J+1 supplied to be factorized by the +* unblocked routine, we can return from +* the routine. Perform the following before returning: +* a) Set the number of factorized columns K, +* b) MAXC2NRMK and RELMAXC2NRMK are returned by the +* unblocked factorization routine above. +* + K = J - 1 + KF +* +* Set INFO on the first exception occurence. +* +* Set INFO on the first exception occurence of Inf or NaN, +* (NaN takes precedence over Inf). +* + IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN + INFO = 2*IOFFSET + IINFO + ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN + INFO = IOFFSET + IINFO + END IF +* + ELSE +* +* Compute the return values for blocked code. +* +* Set the number of factorized columns if the unblocked routine +* was not called. +* + K = JMAX +* +* If there exits a residual matrix after the blocked code: +* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the +* residual matrix, otherwise set them to ZERO; +* 2) Set TAU(K+1:MINMN) to ZERO. +* + IF( K.LT.MINMN ) THEN + JMAXC2NRM = K + ISAMAX( N-K, RWORK( K+1 ), 1 ) + MAXC2NRMK = RWORK( JMAXC2NRM ) + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + DO J = K + 1, MINMN + TAU( J ) = CZERO + END DO +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO +* + END IF +* +* END IF( J.LE.JMAX ) THEN +* + END IF +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* + RETURN +* +* End of CGEQP3RK +* + END diff --git a/lapack-netlib/SRC/cgeqr.f b/lapack-netlib/SRC/cgeqr.f index d10e3da65..3617594d0 100644 --- a/lapack-netlib/SRC/cgeqr.f +++ b/lapack-netlib/SRC/cgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE CGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,11 +190,12 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLATSQR, CGEQRT, XERBLA @@ -244,8 +247,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +258,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +273,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +287,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +314,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) * RETURN * diff --git a/lapack-netlib/SRC/cgeqrf.f b/lapack-netlib/SRC/cgeqrf.f index d71bd5b33..bf22a2cd3 100644 --- a/lapack-netlib/SRC/cgeqrf.f +++ b/lapack-netlib/SRC/cgeqrf.f @@ -121,7 +121,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup geqrf * *> \par Further Details: * ===================== @@ -170,7 +170,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -199,7 +200,7 @@ ELSE LWKOPT = N*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * @@ -274,7 +275,7 @@ $ CALL CGEQR2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of CGEQRF diff --git a/lapack-netlib/SRC/cgeqrfp.f b/lapack-netlib/SRC/cgeqrfp.f index 995404f43..5b6226c67 100644 --- a/lapack-netlib/SRC/cgeqrfp.f +++ b/lapack-netlib/SRC/cgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -122,7 +123,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup geqrfp * *> \par Further Details: * ===================== @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. EXTERNAL CGEQR2P, CLARFB, CLARFT, XERBLA @@ -173,7 +174,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -181,8 +183,16 @@ * INFO = 0 NB = ILAENV( 1, 'CGEQRF', ' ', M, N, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = LWKOPT + K = MIN( M, N ) + IF( K.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = N + LWKOPT = N*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -190,7 +200,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN @@ -202,7 +212,6 @@ * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -210,7 +219,7 @@ * NBMIN = 2 NX = 0 - IWS = N + IWS = LWKMIN IF( NB.GT.1 .AND. NB.LT.K ) THEN * * Determine when to cross over from blocked to unblocked code. @@ -272,7 +281,7 @@ $ CALL CGEQR2P( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGEQRFP diff --git a/lapack-netlib/SRC/cgerqf.f b/lapack-netlib/SRC/cgerqf.f index d2247844c..6f914c892 100644 --- a/lapack-netlib/SRC/cgerqf.f +++ b/lapack-netlib/SRC/cgerqf.f @@ -114,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup gerqf * *> \par Further Details: * ===================== @@ -163,7 +163,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -187,7 +188,7 @@ NB = ILAENV( 1, 'CGERQF', ' ', M, N, -1, -1 ) LWKOPT = M*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF ( .NOT.LQUERY ) THEN IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) @@ -278,7 +279,7 @@ IF( MU.GT.0 .AND. NU.GT.0 ) $ CALL CGERQ2( MU, NU, A, LDA, TAU, WORK, IINFO ) * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of CGERQF diff --git a/lapack-netlib/SRC/cgesdd.f b/lapack-netlib/SRC/cgesdd.f index b824374d3..1838629ae 100644 --- a/lapack-netlib/SRC/cgesdd.f +++ b/lapack-netlib/SRC/cgesdd.f @@ -53,12 +53,6 @@ *> *> Note that the routine returns VT = V**H, not V. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/cgesvd.f b/lapack-netlib/SRC/cgesvd.f index 239b13431..6165a6acf 100644 --- a/lapack-netlib/SRC/cgesvd.f +++ b/lapack-netlib/SRC/cgesvd.f @@ -206,7 +206,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEsing +*> \ingroup gesvd * * ===================================================================== SUBROUTINE CGESVD( JOBU, JOBVT, M, N, A, LDA, S, U, LDU, VT, LDVT, @@ -259,8 +259,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANGE, SLAMCH - EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, SQRT @@ -615,7 +615,7 @@ END IF END IF MAXWRK = MAX( MINWRK, MAXWRK ) - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -13 @@ -3694,7 +3694,7 @@ * * Return optimal workspace in WORK(1) * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * RETURN * diff --git a/lapack-netlib/SRC/cgesvdq.f b/lapack-netlib/SRC/cgesvdq.f index 1a587eca3..95091c2c0 100644 --- a/lapack-netlib/SRC/cgesvdq.f +++ b/lapack-netlib/SRC/cgesvdq.f @@ -363,7 +363,7 @@ *> an optimal implementation would do all necessary scaling before calling *> CGESVD and the scaling in CGESVD can be switched off. *> 3. Other comments related to code optimization are given in comments in the -*> code, enlosed in [[double brackets]]. +*> code, enclosed in [[double brackets]]. *> \endverbatim * *> \par Bugs, examples and comments diff --git a/lapack-netlib/SRC/cgesvdx.f b/lapack-netlib/SRC/cgesvdx.f index fbdb121ca..e1856a65f 100644 --- a/lapack-netlib/SRC/cgesvdx.f +++ b/lapack-netlib/SRC/cgesvdx.f @@ -208,7 +208,7 @@ *> \param[out] WORK *> \verbatim *> WORK is COMPLEX array, dimension (MAX(1,LWORK)) -*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK; +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK @@ -261,7 +261,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEsing +*> \ingroup gesvdx * * ===================================================================== SUBROUTINE CGESVDX( JOBU, JOBVT, RANGE, M, N, A, LDA, VL, VU, @@ -312,8 +312,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, CLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, CLANGE + REAL SLAMCH, CLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, CLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, SQRT @@ -448,7 +448,7 @@ END IF END IF MAXWRK = MAX( MAXWRK, MINWRK ) - WORK( 1 ) = CMPLX( REAL( MAXWRK ), ZERO ) + WORK( 1 ) = SROUNDUP_LWORK( MAXWRK ) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -19 @@ -464,7 +464,7 @@ * * Quick return if possible * - IF( M.EQ.0 .OR. N.EQ.0 ) THEN + IF( MINMN.EQ.0 ) THEN RETURN END IF * @@ -846,7 +846,7 @@ * * Return optimal workspace in WORK(1) * - WORK( 1 ) = CMPLX( REAL( MAXWRK ), ZERO ) + WORK( 1 ) = SROUNDUP_LWORK( MAXWRK ) * RETURN * diff --git a/lapack-netlib/SRC/cgesvj.f b/lapack-netlib/SRC/cgesvj.f index 149cf5e48..b9c8f1709 100644 --- a/lapack-netlib/SRC/cgesvj.f +++ b/lapack-netlib/SRC/cgesvj.f @@ -208,15 +208,17 @@ *> \verbatim *> CWORK is COMPLEX array, dimension (max(1,LWORK)) *> Used as workspace. -*> If on entry LWORK = -1, then a workspace query is assumed and -*> no computation is done; CWORK(1) is set to the minial (and optimal) -*> length of CWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER. -*> Length of CWORK, LWORK >= M+N. +*> Length of CWORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M+N, otherwise. +*> +*> If on entry LWORK = -1, then a workspace query is assumed and +*> no computation is done; CWORK(1) is set to the minial (and optimal) +*> length of CWORK. *> \endverbatim *> *> \param[in,out] RWORK @@ -247,15 +249,17 @@ *> RWORK(6) = the largest absolute value over all sines of the *> Jacobi rotation angles in the last sweep. It can be *> useful for a post festum analysis. -*> If on entry LRWORK = -1, then a workspace query is assumed and -*> no computation is done; RWORK(1) is set to the minial (and optimal) -*> length of RWORK. *> \endverbatim *> *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> Length of RWORK, LRWORK >= MAX(6,N). +*> Length of RWORK. +*> LRWORK >= 1, if MIN(M,N) = 0, and LRWORK >= MAX(6,N), otherwise +*> +*> If on entry LRWORK = -1, then a workspace query is assumed and +*> no computation is done; RWORK(1) is set to the minial (and optimal) +*> length of RWORK. *> \endverbatim *> *> \param[out] INFO @@ -276,7 +280,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup gesvj * *> \par Further Details: * ===================== @@ -374,16 +378,17 @@ PARAMETER ( NSWEEP = 30 ) * .. * .. Local Scalars .. - COMPLEX AAPQ, OMPQ - REAL AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG, - $ BIGTHETA, CS, CTOL, EPSLN, MXAAPQ, - $ MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL, - $ SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL - INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, - $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, - $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + COMPLEX AAPQ, OMPQ + REAL AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG, + $ BIGTHETA, CS, CTOL, EPSLN, MXAAPQ, + $ MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL, + $ SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL + INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, + $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, + $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND, + $ MINMN, LWMIN, LRWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK, + $ RSVEC, UCTOL, UPPER * .. * .. * .. Intrinsic Functions .. @@ -398,8 +403,8 @@ INTEGER ISAMAX EXTERNAL ISAMAX * from LAPACK - REAL SLAMCH - EXTERNAL SLAMCH + REAL SLAMCH, SROUNDUP_LWORK + EXTERNAL SLAMCH, SROUNDUP_LWORK LOGICAL LSAME EXTERNAL LSAME * .. @@ -422,7 +427,16 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * - LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 ) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + LRWMIN = 1 + ELSE + LWMIN = M + N + LRWMIN = MAX( 6, N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -442,9 +456,9 @@ INFO = -11 ELSE IF( UCTOL .AND. ( RWORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( LWORK.LT.( M+N ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 - ELSE IF( LRWORK.LT.MAX( N, 6 ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LRWORK.LT.LRWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -15 ELSE INFO = 0 @@ -454,15 +468,15 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'CGESVJ', -INFO ) RETURN - ELSE IF ( LQUERY ) THEN - CWORK(1) = M + N - RWORK(1) = MAX( N, 6 ) + ELSE IF( LQUERY ) THEN + CWORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/cgetf2.f b/lapack-netlib/SRC/cgetf2.f index aac989970..995ee40ec 100644 --- a/lapack-netlib/SRC/cgetf2.f +++ b/lapack-netlib/SRC/cgetf2.f @@ -101,7 +101,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup getf2 * * ===================================================================== SUBROUTINE CGETF2( M, N, A, LDA, IPIV, INFO ) @@ -126,16 +126,14 @@ $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. - REAL SFMIN - INTEGER I, J, JP + INTEGER J, JP * .. * .. External Functions .. - REAL SLAMCH INTEGER ICAMAX - EXTERNAL SLAMCH, ICAMAX + EXTERNAL ICAMAX * .. * .. External Subroutines .. - EXTERNAL CGERU, CSCAL, CSWAP, XERBLA + EXTERNAL CGERU, CRSCL, CSWAP, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -161,10 +159,6 @@ * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN -* -* Compute machine safe minimum -* - SFMIN = SLAMCH('S') * DO 10 J = 1, MIN( M, N ) * @@ -181,15 +175,8 @@ * * Compute elements J+1:M of J-th column. * - IF( J.LT.M ) THEN - IF( ABS(A( J, J )) .GE. SFMIN ) THEN - CALL CSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) - ELSE - DO 20 I = 1, M-J - A( J+I, J ) = A( J+I, J ) / A( J, J ) - 20 CONTINUE - END IF - END IF + IF( J.LT.M ) + $ CALL CRSCL( M-J, A( J, J ), A( J+1, J ), 1 ) * ELSE IF( INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/cgetri.f b/lapack-netlib/SRC/cgetri.f index bd7fc286c..2eb3da7ab 100644 --- a/lapack-netlib/SRC/cgetri.f +++ b/lapack-netlib/SRC/cgetri.f @@ -107,7 +107,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup getri * * ===================================================================== SUBROUTINE CGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -138,7 +138,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGEMM, CGEMV, CSWAP, CTRSM, CTRTRI, XERBLA @@ -152,8 +153,8 @@ * INFO = 0 NB = ILAENV( 1, 'CGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = LWKOPT + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -251,7 +252,7 @@ $ CALL CSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) 60 CONTINUE * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGETRI diff --git a/lapack-netlib/SRC/cgetsls.f b/lapack-netlib/SRC/cgetsls.f index 8a4d02224..3f43dc8de 100644 --- a/lapack-netlib/SRC/cgetsls.f +++ b/lapack-netlib/SRC/cgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEsolve +*> \ingroup getsls * * ===================================================================== SUBROUTINE CGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB, @@ -191,15 +191,15 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, CLANGE - EXTERNAL LSAME, SLABAD, SLAMCH, CLANGE + REAL SLAMCH, CLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, CLANGE, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGEQR, CGEMQR, CLASCL, CLASET, $ CTRTRS, XERBLA, CGELQ, CGEMLQ * .. * .. Intrinsic Functions .. - INTRINSIC REAL, MAX, MIN, INT + INTRINSIC MAX, MIN, INT * .. * .. Executable Statements .. * @@ -229,7 +229,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEO = 1 + WSIZEM = 1 + ELSE IF ( M.GE.N ) THEN CALL CGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) @@ -265,7 +268,7 @@ INFO = -10 END IF * - WORK( 1 ) = REAL( WSIZEO ) + WORK( 1 ) = SROUNDUP_LWORK( WSIZEO ) * END IF * @@ -274,7 +277,7 @@ RETURN END IF IF( LQUERY ) THEN - IF( LWORK.EQ.-2 ) WORK( 1 ) = REAL( WSIZEM ) + IF( LWORK.EQ.-2 ) WORK( 1 ) = SROUNDUP_LWORK( WSIZEM ) RETURN END IF IF( LWORK.LT.WSIZEO ) THEN @@ -297,7 +300,6 @@ * SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * @@ -485,7 +487,7 @@ END IF * 50 CONTINUE - WORK( 1 ) = REAL( TSZO + LWO ) + WORK( 1 ) = SROUNDUP_LWORK( TSZO + LWO ) RETURN * * End of CGETSLS diff --git a/lapack-netlib/SRC/cgetsqrhrt.f b/lapack-netlib/SRC/cgetsqrhrt.f index 4e4dc1d4a..087e9bc7f 100644 --- a/lapack-netlib/SRC/cgetsqrhrt.f +++ b/lapack-netlib/SRC/cgetsqrhrt.f @@ -131,13 +131,15 @@ *> \param[in] LWORK *> \verbatim *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -160,7 +162,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup comlpexOTHERcomputational +*> \ingroup getsqrhrt * *> \par Contributors: * ================== @@ -200,6 +202,10 @@ INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS * .. +* .. External Functions .. + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL CCOPY, CLATSQR, CUNGTSQR_ROW, CUNHR_COL, $ XERBLA @@ -212,7 +218,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -225,7 +231,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -263,8 +269,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN INFO = -11 END IF * @@ -277,14 +284,14 @@ CALL XERBLA( 'CGETSQRHRT', -INFO ) RETURN ELSE IF ( LQUERY ) THEN - WORK( 1 ) = CMPLX( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * * Quick return if possible * IF( MIN( M, N ).EQ.0 ) THEN - WORK( 1 ) = CMPLX( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * @@ -341,9 +348,9 @@ END IF END DO * - WORK( 1 ) = CMPLX( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN * * End of CGETSQRHRT * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/cgges.f b/lapack-netlib/SRC/cgges.f index c54174da4..0ff848735 100644 --- a/lapack-netlib/SRC/cgges.f +++ b/lapack-netlib/SRC/cgges.f @@ -261,7 +261,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup gges * * ===================================================================== SUBROUTINE CGGES( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, LDB, @@ -312,14 +312,13 @@ * .. * .. External Subroutines .. EXTERNAL CGEQRF, CGGBAK, CGGBAL, CGGHRD, CHGEQZ, CLACPY, - $ CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, SLABAD, - $ XERBLA + $ CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANGE, SLAMCH - EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -390,7 +389,7 @@ LWKOPT = MAX( LWKOPT, N + $ N*ILAENV( 1, 'CUNGQR', ' ', N, 1, N, -1 ) ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -18 @@ -415,7 +414,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -587,7 +585,7 @@ * 30 CONTINUE * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/cgges3.f b/lapack-netlib/SRC/cgges3.f index aac9f9510..c1ca79688 100644 --- a/lapack-netlib/SRC/cgges3.f +++ b/lapack-netlib/SRC/cgges3.f @@ -215,7 +215,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -260,7 +261,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup gges3 * * ===================================================================== SUBROUTINE CGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, @@ -300,7 +301,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT + $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT, + $ LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SMLNUM * .. @@ -310,13 +312,12 @@ * .. * .. External Subroutines .. EXTERNAL CGEQRF, CGGBAK, CGGBAL, CGGHD3, CLAQZ0, CLACPY, - $ CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, SLABAD, - $ XERBLA + $ CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, XERBLA * .. * .. External Functions .. LOGICAL LSAME - REAL CLANGE, SLAMCH - EXTERNAL LSAME, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -353,6 +354,8 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -369,7 +372,7 @@ INFO = -14 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -16 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * @@ -377,29 +380,33 @@ * IF( INFO.EQ.0 ) THEN CALL CGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N + INT( WORK( 1 ) ) ) CALL CUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) IF( ILVSL ) THEN CALL CUNGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, $ IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) END IF CALL CGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL, $ LDVSL, VSR, LDVSR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) CALL CLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, WORK, -1, $ RWORK, 0, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) IF( WANTST ) THEN CALL CTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, SDIM, $ PVSL, PVSR, DIF, WORK, -1, IDUM, 1, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF - WORK( 1 ) = CMPLX( LWKOPT ) END IF * @@ -422,7 +429,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -585,7 +591,7 @@ * 30 CONTINUE * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cggesx.f b/lapack-netlib/SRC/cggesx.f index 6385a74c1..3bf460fac 100644 --- a/lapack-netlib/SRC/cggesx.f +++ b/lapack-netlib/SRC/cggesx.f @@ -320,7 +320,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup ggesx * * ===================================================================== SUBROUTINE CGGESX( JOBVSL, JOBVSR, SORT, SELCTG, SENSE, N, A, LDA, @@ -373,14 +373,13 @@ * .. * .. External Subroutines .. EXTERNAL CGEQRF, CGGBAK, CGGBAL, CGGHRD, CHGEQZ, CLACPY, - $ CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, SLABAD, - $ XERBLA + $ CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANGE, SLAMCH - EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -476,7 +475,7 @@ MAXWRK = 1 LWRK = 1 END IF - WORK( 1 ) = LWRK + WORK( 1 ) = SROUNDUP_LWORK(LWRK) IF( WANTSN .OR. N.EQ.0 ) THEN LIWMIN = 1 ELSE @@ -510,7 +509,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -705,7 +703,7 @@ * 40 CONTINUE * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/cggev.f b/lapack-netlib/SRC/cggev.f index c1c28a180..cf16e3079 100644 --- a/lapack-netlib/SRC/cggev.f +++ b/lapack-netlib/SRC/cggev.f @@ -209,7 +209,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup ggev * * ===================================================================== SUBROUTINE CGGEV( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHA, BETA, @@ -254,14 +254,13 @@ * .. * .. External Subroutines .. EXTERNAL CGEQRF, CGGBAK, CGGBAL, CGGHRD, CHGEQZ, CLACPY, - $ CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, SLABAD, - $ XERBLA + $ CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANGE, SLAMCH - EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MAX, REAL, SQRT @@ -336,7 +335,7 @@ LWKOPT = MAX( LWKOPT, N + $ N*ILAENV( 1, 'CUNGQR', ' ', N, 1, N, -1 ) ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -15 @@ -359,7 +358,6 @@ EPS = SLAMCH( 'E' )*SLAMCH( 'B' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -547,7 +545,7 @@ IF( ILBSCL ) $ CALL CLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR ) * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CGGEV diff --git a/lapack-netlib/SRC/cggev3.f b/lapack-netlib/SRC/cggev3.f index 9483ecdeb..d2b75aebc 100644 --- a/lapack-netlib/SRC/cggev3.f +++ b/lapack-netlib/SRC/cggev3.f @@ -174,7 +174,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -208,7 +209,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup ggev3 * * ===================================================================== SUBROUTINE CGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHA, BETA, @@ -243,7 +244,7 @@ CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, $ IN, IRIGHT, IROWS, IRWRK, ITAU, IWRK, JC, JR, - $ LWKOPT + $ LWKOPT, LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP COMPLEX X @@ -253,13 +254,12 @@ * .. * .. External Subroutines .. EXTERNAL CGEQRF, CGGBAK, CGGBAL, CGGHD3, CLAQZ0, CLACPY, - $ CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, SLABAD, - $ XERBLA + $ CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, XERBLA * .. * .. External Functions .. LOGICAL LSAME - REAL CLANGE, SLAMCH - EXTERNAL LSAME, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MAX, REAL, SQRT @@ -301,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -315,7 +316,7 @@ INFO = -11 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -15 END IF * @@ -323,7 +324,7 @@ * IF( INFO.EQ.0 ) THEN CALL CGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( N, N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N+INT( WORK( 1 ) ) ) CALL CUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) @@ -348,7 +349,11 @@ $ RWORK, 0, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) END IF - WORK( 1 ) = CMPLX( LWKOPT ) + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + END IF END IF * IF( INFO.NE.0 ) THEN @@ -368,7 +373,6 @@ EPS = SLAMCH( 'E' )*SLAMCH( 'B' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -549,7 +553,7 @@ IF( ILBSCL ) $ CALL CLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR ) * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CGGEV3 diff --git a/lapack-netlib/SRC/cggevx.f b/lapack-netlib/SRC/cggevx.f index 405c9c3b5..fa4e92682 100644 --- a/lapack-netlib/SRC/cggevx.f +++ b/lapack-netlib/SRC/cggevx.f @@ -335,7 +335,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup ggevx * *> \par Further Details: * ===================== @@ -416,13 +416,13 @@ * .. External Subroutines .. EXTERNAL CGEQRF, CGGBAK, CGGBAL, CGGHRD, CHGEQZ, CLACPY, $ CLASCL, CLASET, CTGEVC, CTGSNA, CUNGQR, CUNMQR, - $ SLABAD, SLASCL, XERBLA + $ SLASCL, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANGE, SLAMCH - EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MAX, REAL, SQRT @@ -521,7 +521,7 @@ $ N*ILAENV( 1, 'CUNGQR', ' ', N, 1, N, 0 ) ) END IF END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -25 @@ -545,7 +545,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -793,7 +792,7 @@ IF( ILBSCL ) $ CALL CLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR ) * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of CGGEVX diff --git a/lapack-netlib/SRC/cggglm.f b/lapack-netlib/SRC/cggglm.f index fb384b651..0d36deca6 100644 --- a/lapack-netlib/SRC/cggglm.f +++ b/lapack-netlib/SRC/cggglm.f @@ -177,7 +177,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHEReigen +*> \ingroup ggglm * * ===================================================================== SUBROUTINE CGGGLM( N, M, P, A, LDA, B, LDB, D, X, Y, WORK, LWORK, @@ -213,7 +213,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN @@ -252,7 +253,7 @@ LWKMIN = M + N + P LWKOPT = M + NP + MAX( N, P )*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -12 diff --git a/lapack-netlib/SRC/cgghd3.f b/lapack-netlib/SRC/cgghd3.f index 1074b4828..f7175a72c 100644 --- a/lapack-netlib/SRC/cgghd3.f +++ b/lapack-netlib/SRC/cgghd3.f @@ -180,14 +180,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -212,7 +212,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup gghd3 * *> \par Further Details: * ===================== @@ -265,7 +265,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGGHRD, CLARTG, CLASET, CUNM22, CROT, CGEMM, @@ -280,8 +281,13 @@ * INFO = 0 NB = ILAENV( 1, 'CGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) - WORK( 1 ) = CMPLX( LWKOPT ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) INITZ = LSAME( COMPZ, 'I' ) @@ -330,7 +336,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = CONE RETURN @@ -888,7 +893,8 @@ IF ( JCOL.LT.IHI ) $ CALL CGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) - WORK( 1 ) = CMPLX( LWKOPT ) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cgglse.f b/lapack-netlib/SRC/cgglse.f index cca20dfed..b1c562385 100644 --- a/lapack-netlib/SRC/cgglse.f +++ b/lapack-netlib/SRC/cgglse.f @@ -172,7 +172,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERsolve +*> \ingroup gglse * * ===================================================================== SUBROUTINE CGGLSE( M, N, P, A, LDA, B, LDB, C, D, X, WORK, LWORK, @@ -207,7 +207,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN @@ -246,7 +247,7 @@ LWKMIN = M + N + P LWKOPT = P + MN + MAX( M, N )*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -12 diff --git a/lapack-netlib/SRC/cggqrf.f b/lapack-netlib/SRC/cggqrf.f index 0185f4e0d..309f170e8 100644 --- a/lapack-netlib/SRC/cggqrf.f +++ b/lapack-netlib/SRC/cggqrf.f @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup ggqrf * *> \par Further Details: * ===================== @@ -236,7 +236,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN @@ -250,8 +251,8 @@ NB2 = ILAENV( 1, 'CGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'CUNMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P)*NB - WORK( 1 ) = LWKOPT + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -287,7 +288,7 @@ * RQ factorization of N-by-P matrix B: B = T*Z. * CALL CGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO ) - WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( MAX( LOPT, INT( WORK( 1 ) ) ) ) * RETURN * diff --git a/lapack-netlib/SRC/cggrqf.f b/lapack-netlib/SRC/cggrqf.f index 5227100da..8470a1ce2 100644 --- a/lapack-netlib/SRC/cggrqf.f +++ b/lapack-netlib/SRC/cggrqf.f @@ -172,7 +172,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup ggrqf * *> \par Further Details: * ===================== @@ -235,7 +235,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN @@ -249,8 +250,8 @@ NB2 = ILAENV( 1, 'CGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'CUNMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P)*NB - WORK( 1 ) = LWKOPT + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -287,7 +288,7 @@ * QR factorization of P-by-N matrix B: B = Z*T * CALL CGEQRF( P, N, B, LDB, TAUB, WORK, LWORK, INFO ) - WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( MAX( LOPT, INT( WORK( 1 ) ) ) ) * RETURN * diff --git a/lapack-netlib/SRC/cggsvd3.f b/lapack-netlib/SRC/cggsvd3.f index f248aebd5..4c4b85bae 100644 --- a/lapack-netlib/SRC/cggsvd3.f +++ b/lapack-netlib/SRC/cggsvd3.f @@ -278,7 +278,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -333,7 +333,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEsing +*> \ingroup ggsvd3 * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/cggsvp3.f b/lapack-netlib/SRC/cggsvp3.f index 008a053a2..e19f7efd5 100644 --- a/lapack-netlib/SRC/cggsvp3.f +++ b/lapack-netlib/SRC/cggsvp3.f @@ -233,7 +233,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -256,7 +256,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup ggsvp3 * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/cgsvj0.f b/lapack-netlib/SRC/cgsvj0.f index 37853a10a..7426ecdf1 100644 --- a/lapack-netlib/SRC/cgsvj0.f +++ b/lapack-netlib/SRC/cgsvj0.f @@ -52,10 +52,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim @@ -117,7 +117,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -125,9 +125,9 @@ *> \param[in,out] V *> \verbatim *> V is COMPLEX array, dimension (LDV,N) -*> If JOBV = 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/cgsvj1.f b/lapack-netlib/SRC/cgsvj1.f index f1a5204e3..18dd690cc 100644 --- a/lapack-netlib/SRC/cgsvj1.f +++ b/lapack-netlib/SRC/cgsvj1.f @@ -75,10 +75,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim @@ -147,7 +147,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -155,9 +155,9 @@ *> \param[in,out] V *> \verbatim *> V is COMPLEX array, dimension (LDV,N) -*> If JOBV = 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/chbev_2stage.f b/lapack-netlib/SRC/chbev_2stage.f index 123d84729..f84d8d3d4 100644 --- a/lapack-netlib/SRC/chbev_2stage.f +++ b/lapack-netlib/SRC/chbev_2stage.f @@ -132,7 +132,7 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, dimension) where *> dimension = (2KD+1)*N + KD*NTHREADS @@ -171,7 +171,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHEReigen +*> \ingroup hbev_2stage * *> \par Further Details: * ===================== @@ -189,7 +189,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -197,11 +197,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -240,8 +240,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - REAL SLAMCH, CLANHB - EXTERNAL LSAME, SLAMCH, CLANHB, ILAENV2STAGE + REAL SLAMCH, CLANHB, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, CLANHB, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSCAL, SSTERF, XERBLA, CLASCL, CSTEQR, @@ -276,7 +277,7 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWMIN = 1 - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ELSE IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', JOBZ, $ N, KD, -1, -1 ) @@ -285,7 +286,7 @@ LWTRD = ILAENV2STAGE( 4, 'CHETRD_HB2ST', JOBZ, $ N, KD, IB, -1 ) LWMIN = LHTRD + LWTRD - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ENDIF * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) @@ -351,7 +352,7 @@ LLWORK = LWORK - INDWRK + 1 * CALL CHETRD_HB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W, - $ RWORK( INDE ), WORK( INDHOUS ), LHTRD, + $ RWORK( INDE ), WORK( INDHOUS ), LHTRD, $ WORK( INDWRK ), LLWORK, IINFO ) * * For eigenvalues only, call SSTERF. For eigenvectors, call CSTEQR. @@ -377,7 +378,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/chbevd.f b/lapack-netlib/SRC/chbevd.f index 1598f4de5..a5afe6b76 100644 --- a/lapack-netlib/SRC/chbevd.f +++ b/lapack-netlib/SRC/chbevd.f @@ -41,12 +41,6 @@ *> a complex Hermitian band matrix A. If eigenvectors are desired, it *> uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -207,7 +201,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHEReigen +*> \ingroup hbevd * * ===================================================================== SUBROUTINE CHBEVD( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, WORK, @@ -245,8 +239,8 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL CLANHB, SLAMCH - EXTERNAL LSAME, CLANHB, SLAMCH + REAL CLANHB, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, CLANHB, SLAMCH, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGEMM, CHBTRD, CLACPY, CLASCL, CSTEDC, SSCAL, @@ -294,7 +288,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * @@ -388,7 +382,7 @@ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN RETURN diff --git a/lapack-netlib/SRC/chbevd_2stage.f b/lapack-netlib/SRC/chbevd_2stage.f index 340c546e8..3c9c8ecc0 100644 --- a/lapack-netlib/SRC/chbevd_2stage.f +++ b/lapack-netlib/SRC/chbevd_2stage.f @@ -47,12 +47,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it *> uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/chbevx_2stage.f b/lapack-netlib/SRC/chbevx_2stage.f index 22bced45f..1d609dfbd 100644 --- a/lapack-netlib/SRC/chbevx_2stage.f +++ b/lapack-netlib/SRC/chbevx_2stage.f @@ -22,7 +22,7 @@ * * SUBROUTINE CHBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB, * Q, LDQ, VL, VU, IL, IU, ABSTOL, M, W, -* Z, LDZ, WORK, LWORK, RWORK, IWORK, +* Z, LDZ, WORK, LWORK, RWORK, IWORK, * IFAIL, INFO ) * * IMPLICIT NONE @@ -233,7 +233,7 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, dimension) where *> dimension = (2KD+1)*N + KD*NTHREADS @@ -285,7 +285,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHEReigen +*> \ingroup hbevx_2stage * *> \par Further Details: * ===================== @@ -303,7 +303,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -311,18 +311,18 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * * ===================================================================== SUBROUTINE CHBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB, $ Q, LDQ, VL, VU, IL, IU, ABSTOL, M, W, - $ Z, LDZ, WORK, LWORK, RWORK, IWORK, + $ Z, LDZ, WORK, LWORK, RWORK, IWORK, $ IFAIL, INFO ) * IMPLICIT NONE @@ -367,8 +367,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - REAL SLAMCH, CLANHB - EXTERNAL LSAME, SLAMCH, CLANHB, ILAENV2STAGE + REAL SLAMCH, CLANHB, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, CLANHB, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CCOPY, @@ -424,16 +425,16 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWMIN = 1 - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ELSE IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', JOBZ, $ N, KD, -1, -1 ) - LHTRD = ILAENV2STAGE( 3, 'CHETRD_HB2ST', JOBZ, + LHTRD = ILAENV2STAGE( 3, 'CHETRD_HB2ST', JOBZ, $ N, KD, IB, -1 ) - LWTRD = ILAENV2STAGE( 4, 'CHETRD_HB2ST', JOBZ, + LWTRD = ILAENV2STAGE( 4, 'CHETRD_HB2ST', JOBZ, $ N, KD, IB, -1 ) LWMIN = LHTRD + LWTRD - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ENDIF * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) @@ -637,7 +638,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/chbgvd.f b/lapack-netlib/SRC/chbgvd.f index c4ad20753..00fb2b5f5 100644 --- a/lapack-netlib/SRC/chbgvd.f +++ b/lapack-netlib/SRC/chbgvd.f @@ -46,12 +46,6 @@ *> and banded, and B is also positive definite. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -238,7 +232,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHEReigen +*> \ingroup hbgvd * *> \par Contributors: * ================== @@ -281,7 +275,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSTERF, XERBLA, CGEMM, CHBGST, CHBTRD, CLACPY, @@ -328,7 +323,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * @@ -394,7 +389,7 @@ CALL CLACPY( 'A', N, N, WORK( INDWK2 ), N, Z, LDZ ) END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN RETURN diff --git a/lapack-netlib/SRC/cheev.f b/lapack-netlib/SRC/cheev.f index fb8e451df..60df7d8b8 100644 --- a/lapack-netlib/SRC/cheev.f +++ b/lapack-netlib/SRC/cheev.f @@ -132,7 +132,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup heev * * ===================================================================== SUBROUTINE CHEEV( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, RWORK, @@ -169,8 +169,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANHE, SLAMCH - EXTERNAL ILAENV, LSAME, CLANHE, SLAMCH + REAL CLANHE, SLAMCH, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, CLANHE, SLAMCH, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHETRD, CLASCL, CSTEQR, CUNGTR, SSCAL, SSTERF, @@ -201,7 +201,7 @@ IF( INFO.EQ.0 ) THEN NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, ( NB+1 )*N ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.MAX( 1, 2*N-1 ) .AND. .NOT.LQUERY ) $ INFO = -8 @@ -286,7 +286,7 @@ * * Set WORK(1) to optimal complex workspace size. * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/cheev_2stage.f b/lapack-netlib/SRC/cheev_2stage.f index fb7989d9f..4e1cecc64 100644 --- a/lapack-netlib/SRC/cheev_2stage.f +++ b/lapack-netlib/SRC/cheev_2stage.f @@ -106,12 +106,12 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -149,7 +149,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup heev_2stage * *> \par Further Details: * ===================== @@ -167,7 +167,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -175,11 +175,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -220,8 +220,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - REAL SLAMCH, CLANHE - EXTERNAL LSAME, SLAMCH, CLANHE, ILAENV2STAGE + REAL SLAMCH, CLANHE, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, CLANHE, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSCAL, SSTERF, XERBLA, CLASCL, CSTEQR, @@ -255,7 +256,7 @@ LHTRD = ILAENV2STAGE( 3, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWMIN = N + LHTRD + LWTRD - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -8 @@ -314,7 +315,7 @@ LLWORK = LWORK - INDWRK + 1 * CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, RWORK( INDE ), - $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, + $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, $ WORK( INDWRK ), LLWORK, IINFO ) * * For eigenvalues only, call SSTERF. For eigenvectors, first call @@ -343,7 +344,7 @@ * * Set WORK(1) to optimal complex workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/cheevd.f b/lapack-netlib/SRC/cheevd.f index 2ddf74b98..9b62a2df6 100644 --- a/lapack-netlib/SRC/cheevd.f +++ b/lapack-netlib/SRC/cheevd.f @@ -41,12 +41,6 @@ *> complex Hermitian matrix A. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -122,8 +116,7 @@ *> *> \param[out] RWORK *> \verbatim -*> RWORK is REAL array, -*> dimension (LRWORK) +*> RWORK is REAL array, dimension (MAX(1,LRWORK)) *> On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK. *> \endverbatim *> @@ -186,7 +179,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup heevd * *> \par Further Details: * ===================== @@ -236,8 +229,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANHE, SLAMCH - EXTERNAL ILAENV, LSAME, CLANHE, SLAMCH + REAL CLANHE, SLAMCH, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, CLANHE, SLAMCH, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHETRD, CLACPY, CLASCL, CSTEDC, CUNMTR, SSCAL, @@ -288,8 +281,8 @@ LROPT = LRWMIN LIOPT = LIWMIN END IF - WORK( 1 ) = LOPT - RWORK( 1 ) = LROPT + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LROPT ) IWORK( 1 ) = LIOPT * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -384,8 +377,8 @@ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) END IF * - WORK( 1 ) = LOPT - RWORK( 1 ) = LROPT + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LROPT ) IWORK( 1 ) = LIOPT * RETURN diff --git a/lapack-netlib/SRC/cheevd_2stage.f b/lapack-netlib/SRC/cheevd_2stage.f index 830e13d30..a0e8843ae 100644 --- a/lapack-netlib/SRC/cheevd_2stage.f +++ b/lapack-netlib/SRC/cheevd_2stage.f @@ -46,12 +46,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/cheevr.f b/lapack-netlib/SRC/cheevr.f index b8854b182..ad5c8cd4a 100644 --- a/lapack-netlib/SRC/cheevr.f +++ b/lapack-netlib/SRC/cheevr.f @@ -272,7 +272,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,2*N). +*> The length of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 2*N. *> For optimal efficiency, LWORK >= (NB+1)*N, *> where NB is the max of the blocksize for CHETRD and for *> CUNMTR as returned by ILAENV. @@ -294,7 +295,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. *> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -313,7 +315,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -338,7 +341,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup heevr * *> \par Contributors: * ================== @@ -392,8 +395,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANSY, SLAMCH - EXTERNAL LSAME, ILAENV, CLANSY, SLAMCH + REAL CLANSY, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, CLANSY, SLAMCH, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHETRD, CSSCAL, CSTEMR, CSTEIN, CSWAP, CUNMTR, @@ -417,9 +420,15 @@ LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR. $ ( LIWORK.EQ.-1 ) ) * - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) - LWMIN = MAX( 1, 2*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 2*N + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -454,8 +463,8 @@ NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'CUNMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = LWKOPT - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -483,7 +492,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = REAL( A( 1, 1 ) ) @@ -710,8 +719,8 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWKOPT - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/cheevr_2stage.f b/lapack-netlib/SRC/cheevr_2stage.f index 0332a09bc..e06925fcd 100644 --- a/lapack-netlib/SRC/cheevr_2stage.f +++ b/lapack-netlib/SRC/cheevr_2stage.f @@ -265,7 +265,7 @@ *> indicating the nonzero elements in Z. The i-th eigenvector *> is nonzero only in elements ISUPPZ( 2*i-1 ) through *> ISUPPZ( 2*i ). This is an output of CSTEMR (tridiagonal -*> matrix). The support of the eigenvectors of A is typically +*> matrix). The support of the eigenvectors of A is typically *> 1:N because of the unitary transformations applied by CUNMTR. *> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 *> \endverbatim @@ -279,12 +279,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -310,7 +311,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. *> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -329,7 +331,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -354,7 +357,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup heevr_2stage * *> \par Contributors: * ================== @@ -382,7 +385,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -390,11 +393,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -443,8 +446,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV, ILAENV2STAGE - REAL SLAMCH, CLANSY - EXTERNAL LSAME, SLAMCH, CLANSY, ILAENV, ILAENV2STAGE + REAL SLAMCH, CLANSY, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, CLANSY, ILAENV, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL, @@ -472,9 +476,16 @@ IB = ILAENV2STAGE( 2, 'CHETRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = N + LHTRD + LWTRD - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = N + LHTRD + LWTRD + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -506,8 +517,8 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -535,7 +546,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = REAL( A( 1, 1 ) ) @@ -643,9 +654,9 @@ * * Call CHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form. * - CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), + CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), $ RWORK( INDRE ), WORK( INDTAU ), - $ WORK( INDHOUS ), LHTRD, + $ WORK( INDHOUS ), LHTRD, $ WORK( INDWK ), LLWORK, IINFO ) * * If all eigenvalues are desired @@ -666,7 +677,7 @@ CALL SCOPY( N-1, RWORK( INDRE ), 1, RWORK( INDREE ), 1 ) CALL SCOPY( N, RWORK( INDRD ), 1, RWORK( INDRDD ), 1 ) * - IF (ABSTOL .LE. TWO*N*EPS) THEN + IF ( ABSTOL .LE. TWO*N*EPS ) THEN TRYRAC = .TRUE. ELSE TRYRAC = .FALSE. @@ -765,8 +776,8 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/cheevx.f b/lapack-netlib/SRC/cheevx.f index 1cec902aa..a8a2bde63 100644 --- a/lapack-netlib/SRC/cheevx.f +++ b/lapack-netlib/SRC/cheevx.f @@ -250,7 +250,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup heevx * * ===================================================================== SUBROUTINE CHEEVX( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, IL, IU, @@ -294,8 +294,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, CLANHE - EXTERNAL LSAME, ILAENV, SLAMCH, CLANHE + REAL SLAMCH, CLANHE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, CLANHE, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL, @@ -348,14 +348,14 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWKMIN = 1 - WORK( 1 ) = LWKMIN + LWKOPT = 1 ELSE LWKMIN = 2*N NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'CUNMTR', UPLO, N, -1, -1, -1 ) ) - LWKOPT = MAX( 1, ( NB + 1 )*N ) - WORK( 1 ) = LWKOPT + LWKOPT = ( NB + 1 )*N END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -17 @@ -552,7 +552,7 @@ * * Set WORK(1) to optimal complex workspace size. * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/cheevx_2stage.f b/lapack-netlib/SRC/cheevx_2stage.f index 04f6e30e8..70a681ec4 100644 --- a/lapack-netlib/SRC/cheevx_2stage.f +++ b/lapack-netlib/SRC/cheevx_2stage.f @@ -209,12 +209,12 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 8*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -265,7 +265,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup heevx_2stage * *> \par Further Details: * ===================== @@ -283,7 +283,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -291,11 +291,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -335,7 +335,7 @@ CHARACTER ORDER INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL, $ INDISP, INDIWK, INDRWK, INDTAU, INDWRK, ISCALE, - $ ITMP1, J, JJ, LLWORK, + $ ITMP1, J, JJ, LLWORK, $ NSPLIT, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS REAL ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, $ SIGMA, SMLNUM, TMP1, VLL, VUU @@ -343,8 +343,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - REAL SLAMCH, CLANHE - EXTERNAL LSAME, SLAMCH, CLANHE, ILAENV2STAGE + REAL SLAMCH, CLANHE, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, CLANHE, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL, @@ -397,7 +398,7 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWMIN = 1 - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ELSE KD = ILAENV2STAGE( 1, 'CHETRD_2STAGE', JOBZ, $ N, -1, -1, -1 ) @@ -408,7 +409,7 @@ LWTRD = ILAENV2STAGE( 4, 'CHETRD_2STAGE', JOBZ, $ N, KD, IB, -1 ) LWMIN = N + LHTRD + LWTRD - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) END IF * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) @@ -499,7 +500,7 @@ LLWORK = LWORK - INDWRK + 1 * CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDD ), - $ RWORK( INDE ), WORK( INDTAU ), + $ RWORK( INDE ), WORK( INDTAU ), $ WORK( INDHOUS ), LHTRD, WORK( INDWRK ), $ LLWORK, IINFO ) * @@ -610,7 +611,7 @@ * * Set WORK(1) to optimal complex workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/chegv.f b/lapack-netlib/SRC/chegv.f index f7675a19f..53f9d5196 100644 --- a/lapack-netlib/SRC/chegv.f +++ b/lapack-netlib/SRC/chegv.f @@ -160,7 +160,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup hegv * * ===================================================================== SUBROUTINE CHEGV( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W, WORK, @@ -206,7 +206,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHEEV, CHEGST, CPOTRF, CTRMM, CTRSM, XERBLA @@ -240,7 +241,7 @@ IF( INFO.EQ.0 ) THEN NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, ( NB + 1 )*N ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.MAX( 1, 2*N-1 ) .AND. .NOT.LQUERY ) THEN INFO = -11 @@ -309,7 +310,7 @@ END IF END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/chegv_2stage.f b/lapack-netlib/SRC/chegv_2stage.f index 472581c4b..8de1f7f06 100644 --- a/lapack-netlib/SRC/chegv_2stage.f +++ b/lapack-netlib/SRC/chegv_2stage.f @@ -144,12 +144,12 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -179,7 +179,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim @@ -192,7 +192,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup hegv_2stage * *> \par Further Details: * ===================== @@ -210,7 +210,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -218,11 +218,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -259,7 +259,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - EXTERNAL LSAME, ILAENV2STAGE + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV2STAGE, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CHEGST, CPOTRF, CTRMM, CTRSM, @@ -297,7 +298,7 @@ LHTRD = ILAENV2STAGE( 3, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWMIN = N + LHTRD + LWTRD - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -11 @@ -327,7 +328,7 @@ * Transform problem to standard eigenvalue problem and solve. * CALL CHEGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) - CALL CHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, + CALL CHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, $ WORK, LWORK, RWORK, INFO ) * IF( WANTZ ) THEN @@ -367,7 +368,7 @@ END IF END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/chegvd.f b/lapack-netlib/SRC/chegvd.f index 4b7f43d52..d2dc941e6 100644 --- a/lapack-netlib/SRC/chegvd.f +++ b/lapack-netlib/SRC/chegvd.f @@ -43,12 +43,6 @@ *> B are assumed to be Hermitian and B is also positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -212,7 +206,7 @@ *> the submatrix lying in rows and columns INFO/(N+1) *> through mod(INFO,N+1); *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim @@ -225,7 +219,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup hegvd * *> \par Further Details: * ===================== @@ -274,7 +268,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHEEVD, CHEGST, CPOTRF, CTRMM, CTRSM, XERBLA @@ -322,7 +317,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LOPT + WORK( 1 ) = SROUNDUP_LWORK(LOPT) RWORK( 1 ) = LROPT IWORK( 1 ) = LIOPT * @@ -398,7 +393,7 @@ END IF END IF * - WORK( 1 ) = LOPT + WORK( 1 ) = SROUNDUP_LWORK(LOPT) RWORK( 1 ) = LROPT IWORK( 1 ) = LIOPT * diff --git a/lapack-netlib/SRC/chegvx.f b/lapack-netlib/SRC/chegvx.f index 6e428242d..172d0571e 100644 --- a/lapack-netlib/SRC/chegvx.f +++ b/lapack-netlib/SRC/chegvx.f @@ -280,7 +280,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim @@ -293,7 +293,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup hegvx * *> \par Contributors: * ================== @@ -335,7 +335,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHEEVX, CHEGST, CPOTRF, CTRMM, CTRSM, XERBLA @@ -390,7 +391,7 @@ IF( INFO.EQ.0 ) THEN NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, ( NB + 1 )*N ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN INFO = -20 @@ -464,7 +465,7 @@ * * Set WORK(1) to optimal complex workspace size. * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/chesv.f b/lapack-netlib/SRC/chesv.f index 238fb0a94..cea1235b7 100644 --- a/lapack-netlib/SRC/chesv.f +++ b/lapack-netlib/SRC/chesv.f @@ -163,7 +163,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEsolve +*> \ingroup hesv * * ===================================================================== SUBROUTINE CHESV( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -191,7 +191,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CHETRF, CHETRS, CHETRS2 @@ -226,7 +227,7 @@ NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) LWKOPT = N*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -259,7 +260,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/chesv_aa.f b/lapack-netlib/SRC/chesv_aa.f index c9b97e09c..0f41c9332 100644 --- a/lapack-netlib/SRC/chesv_aa.f +++ b/lapack-netlib/SRC/chesv_aa.f @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEsolve +*> \ingroup hesv_aa * * ===================================================================== SUBROUTINE CHESV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -177,12 +177,13 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS + INTEGER LWKMIN, LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CHETRF_AA, CHETRS_AA @@ -196,6 +197,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -206,18 +208,18 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 2*N, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_HETRF = INT( WORK(1) ) + LWKOPT_HETRF = INT( WORK( 1 ) ) CALL CHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_HETRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS ) - WORK( 1 ) = LWKOPT + LWKOPT_HETRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_HETRF, LWKOPT_HETRS ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -239,7 +241,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chesv_aa_2stage.f b/lapack-netlib/SRC/chesv_aa_2stage.f index e474c6fad..05ebd9253 100644 --- a/lapack-netlib/SRC/chesv_aa_2stage.f +++ b/lapack-netlib/SRC/chesv_aa_2stage.f @@ -87,7 +87,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> @@ -99,14 +99,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX array, dimension (LTB) +*> TB is COMPLEX array, dimension (MAX(1,LTB)). *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -146,14 +146,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX workspace of size LWORK +*> WORK is COMPLEX workspace of size (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -177,7 +178,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hesv_aa_2stage * * ===================================================================== SUBROUTINE CHESV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, @@ -203,11 +204,12 @@ * * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKMIN, LWKOPT * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHETRF_AA_2STAGE, CHETRS_AA_2STAGE, @@ -224,6 +226,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -232,18 +235,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -253,7 +257,6 @@ RETURN END IF * -* * Compute the factorization A = U**H*T*U or A = L*T*L**H. * CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, @@ -267,7 +270,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chesv_rk.f b/lapack-netlib/SRC/chesv_rk.f index e123fa299..268a55e23 100644 --- a/lapack-netlib/SRC/chesv_rk.f +++ b/lapack-netlib/SRC/chesv_rk.f @@ -205,7 +205,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEsolve +*> \ingroup hesv_rk * *> \par Contributors: * ================== @@ -247,7 +247,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CHETRF_RK, CHETRS_3 @@ -282,7 +283,7 @@ CALL CHETRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) LWKOPT = INT( WORK( 1 ) ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -304,7 +305,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/chesv_rook.f b/lapack-netlib/SRC/chesv_rook.f index 8e0b1a88f..2a0d3fdaf 100644 --- a/lapack-netlib/SRC/chesv_rook.f +++ b/lapack-netlib/SRC/chesv_rook.f @@ -184,7 +184,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEsolve +*> \ingroup hesv_rook *> *> \verbatim *> @@ -225,7 +225,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CHETRF_ROOK, CHETRS_ROOK @@ -260,7 +261,7 @@ NB = ILAENV( 1, 'CHETRF_ROOK', UPLO, N, -1, -1, -1 ) LWKOPT = N*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -283,7 +284,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/chesvx.f b/lapack-netlib/SRC/chesvx.f index 6da49bdcf..bdaad55ec 100644 --- a/lapack-netlib/SRC/chesvx.f +++ b/lapack-netlib/SRC/chesvx.f @@ -276,7 +276,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEsolve +*> \ingroup hesvx * * ===================================================================== SUBROUTINE CHESVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B, @@ -307,14 +307,14 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKMIN, LWKOPT, NB REAL ANORM * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANHE, SLAMCH - EXTERNAL ILAENV, LSAME, CLANHE, SLAMCH + REAL CLANHE, SLAMCH, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, CLANHE, SLAMCH, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHECON, CHERFS, CHETRF, CHETRS, CLACPY, XERBLA @@ -329,6 +329,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -346,17 +347,17 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 2*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -405,7 +406,7 @@ IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) $ INFO = N + 1 * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chetf2_rk.f b/lapack-netlib/SRC/chetf2_rk.f index 4bb032382..e687ec64a 100644 --- a/lapack-netlib/SRC/chetf2_rk.f +++ b/lapack-netlib/SRC/chetf2_rk.f @@ -480,7 +480,7 @@ A( J, K ) = CONJG( A( P, J ) ) A( P, J ) = T 14 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = CONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( K, K ) ) @@ -508,7 +508,7 @@ A( J, KK ) = CONJG( A( KP, J ) ) A( KP, J ) = T 15 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = CONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( KK, KK ) ) @@ -834,7 +834,7 @@ A( J, K ) = CONJG( A( P, J ) ) A( P, J ) = T 44 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = CONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( K, K ) ) @@ -862,7 +862,7 @@ A( J, KK ) = CONJG( A( KP, J ) ) A( KP, J ) = T 45 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = CONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( KK, KK ) ) diff --git a/lapack-netlib/SRC/chetf2_rook.f b/lapack-netlib/SRC/chetf2_rook.f index ee4eaf68f..49fba1bda 100644 --- a/lapack-netlib/SRC/chetf2_rook.f +++ b/lapack-netlib/SRC/chetf2_rook.f @@ -420,7 +420,7 @@ A( J, K ) = CONJG( A( P, J ) ) A( P, J ) = T 14 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = CONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( K, K ) ) @@ -441,7 +441,7 @@ A( J, KK ) = CONJG( A( KP, J ) ) A( KP, J ) = T 15 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = CONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( KK, KK ) ) @@ -733,7 +733,7 @@ A( J, K ) = CONJG( A( P, J ) ) A( P, J ) = T 44 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = CONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( K, K ) ) @@ -754,7 +754,7 @@ A( J, KK ) = CONJG( A( KP, J ) ) A( KP, J ) = T 45 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = CONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( KK, KK ) ) diff --git a/lapack-netlib/SRC/chetrd_2stage.f b/lapack-netlib/SRC/chetrd_2stage.f index f5ad35f27..ec7075798 100644 --- a/lapack-netlib/SRC/chetrd_2stage.f +++ b/lapack-netlib/SRC/chetrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download CHETRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download CHETRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * COMPLEX A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. -* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. *> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the unitary *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is COMPLEX array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is COMPLEX array, dimension (LHOUS2) +*> HOUS2 is COMPLEX array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2=-1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -143,13 +145,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -157,9 +162,9 @@ *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +182,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +207,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +215,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -250,7 +255,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - EXTERNAL LSAME, ILAENV2STAGE + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV2STAGE, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -265,10 +271,13 @@ * KD = ILAENV2STAGE( 1, 'CHETRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'CHETRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'CHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -285,8 +294,8 @@ END IF * IF( INFO.EQ.0 ) THEN - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + HOUS2( 1 ) = SROUNDUP_LWORK( LHMIN ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -309,14 +318,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL CHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL CHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRD_HE2HB', -INFO ) RETURN END IF - CALL CHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL CHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRD_HB2ST', -INFO ) @@ -324,8 +333,7 @@ END IF * * - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CHETRD_2STAGE diff --git a/lapack-netlib/SRC/chetrd_hb2st.F b/lapack-netlib/SRC/chetrd_hb2st.F index e047ab720..b0d3e45fb 100644 --- a/lapack-netlib/SRC/chetrd_hb2st.F +++ b/lapack-netlib/SRC/chetrd_hb2st.F @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, +* SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, * D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * #if defined(_OPENMP) @@ -53,12 +53,12 @@ *> \param[in] STAGE1 *> \verbatim *> STAGE1 is CHARACTER*1 -*> = 'N': "No": to mention that the stage 1 of the reduction +*> = 'N': "No": to mention that the stage 1 of the reduction *> from dense to band using the chetrd_he2hb routine -*> was not called before this routine to reproduce AB. -*> In other term this routine is called as standalone. -*> = 'Y': "Yes": to mention that the stage 1 of the -*> reduction from dense to band using the chetrd_he2hb +*> was not called before this routine to reproduce AB. +*> In other term this routine is called as standalone. +*> = 'Y': "Yes": to mention that the stage 1 of the +*> reduction from dense to band using the chetrd_he2hb *> routine has been called to produce AB (e.g., AB is *> the output of chetrd_he2hb. *> \endverbatim @@ -66,10 +66,10 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> and thus LHOUS is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate or to apply Q later on, +*> = 'V': the Householder representation is needed to +*> either generate or to apply Q later on, *> then LHOUS is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -132,34 +132,39 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is COMPLEX array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is COMPLEX array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. +*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error *> message related to LHOUS is issued by XERBLA. *> LHOUS = MAX(1, dimension) where *> dimension = 4*N if VECT='N' -*> not available now if VECT='H' +*> not available now if VECT='H' *> \endverbatim *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension LWORK. +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)). +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -188,7 +193,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup hetrd_hb2st * *> \par Further Details: * ===================== @@ -208,7 +213,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -216,16 +221,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, + SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, $ D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * @@ -259,11 +264,11 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, WANTQ, UPPER, AFTERS1 - INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, + INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, - $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, + $ NBTILES, TTYPE, TID, NTHREADS, + $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SICEV, SIZETAU, LDV, LHMIN, LWMIN REAL ABSTMP @@ -277,15 +282,15 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV2STAGE - EXTERNAL LSAME, ILAENV2STAGE + INTEGER ILAENV2STAGE + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV2STAGE, SROUNDUP_LWORK * .. * .. Executable Statements .. * * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -294,9 +299,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -317,8 +327,8 @@ END IF * IF( INFO.EQ.0 ) THEN - HOUS( 1 ) = LHMIN - WORK( 1 ) = LWMIN + HOUS( 1 ) = SROUNDUP_LWORK( LHMIN ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -358,7 +368,7 @@ ABDPOS = KD + 1 ABOFDPOS = KD ELSE - APOS = INDA + APOS = INDA AWPOS = INDA + KD + 1 DPOS = APOS OFDPOS = DPOS + 1 @@ -366,11 +376,11 @@ ABOFDPOS = 2 ENDIF -* -* Case KD=0: -* The matrix is diagonal. We just copy it (convert to "real" for -* complex because D is double and the imaginary part should be 0) -* and store it in D. A sequential code here is better or +* +* Case KD=0: +* The matrix is diagonal. We just copy it (convert to "real" for +* complex because D is double and the imaginary part should be 0) +* and store it in D. A sequential code here is better or * in a parallel environment it might need two cores for D and E * IF( KD.EQ.0 ) THEN @@ -385,17 +395,17 @@ WORK( 1 ) = 1 RETURN END IF -* -* Case KD=1: -* The matrix is already Tridiagonal. We have to make diagonal +* +* Case KD=1: +* The matrix is already Tridiagonal. We have to make diagonal * and offdiagonal elements real, and store them in D and E. -* For that, for real precision just copy the diag and offdiag -* to D and E while for the COMPLEX case the bulge chasing is -* performed to convert the hermetian tridiagonal to symmetric -* tridiagonal. A simpler conversion formula might be used, but then +* For that, for real precision just copy the diag and offdiag +* to D and E while for the COMPLEX case the bulge chasing is +* performed to convert the hermetian tridiagonal to symmetric +* tridiagonal. A simpler conversion formula might be used, but then * updating the Q matrix will be required and based if Q is generated -* or not this might complicate the story. -* +* or not this might complicate the story. +* IF( KD.EQ.1 ) THEN DO 50 I = 1, N D( I ) = REAL( AB( ABDPOS, I ) ) @@ -444,7 +454,7 @@ C END IF RETURN END IF * -* Main code start here. +* Main code start here. * Reduce the hermitian band of A to a tridiagonal matrix. * THGRSIZ = N @@ -453,7 +463,7 @@ C END IF NBTILES = CEILING( REAL(N)/REAL(KD) ) STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) ) THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) ) -* +* CALL CLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA ) CALL CLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA ) * @@ -462,7 +472,7 @@ C END IF * #if defined(_OPENMP) !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) -!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) +!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) !$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) !$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) !$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) @@ -471,7 +481,7 @@ C END IF #endif * * main bulge chasing loop -* +* DO 100 THGRID = 1, THGRNB STT = (THGRID-1)*THGRSIZ+1 THED = MIN( (STT + THGRSIZ -1), (N-1)) @@ -482,7 +492,7 @@ C END IF ST = STT DO 130 SWEEPID = ST, ED DO 140 K = 1, GRSIZ - MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) + MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) $ + (M-1)*GRSIZ + K IF ( MYID.EQ.1 ) THEN TTYPE = 1 @@ -508,16 +518,16 @@ C END IF ENDIF * * Call the kernel -* +* #if defined(_OPENMP) && _OPENMP >= 201307 - IF( TTYPE.NE.1 ) THEN + IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK @@ -525,20 +535,20 @@ C END IF !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK ENDIF #else - CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, - $ WORK( INDW + TID*KD ) ) -#endif + $ WORK( INDW ) ) +#endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 EXIT @@ -553,14 +563,14 @@ C END IF !$OMP END MASTER !$OMP END PARALLEL #endif -* +* * Copy the diagonal from A to D. Note that D is REAL thus only * the Real part is needed, the imaginary part should be zero. * DO 150 I = 1, N D( I ) = REAL( WORK( DPOS+(I-1)*LDA ) ) 150 CONTINUE -* +* * Copy the off diagonal from A to E. Note that E is REAL thus only * the Real part is needed, the imaginary part should be zero. * @@ -574,11 +584,10 @@ C END IF 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CHETRD_HB2ST * END - + diff --git a/lapack-netlib/SRC/chetrd_he2hb.f b/lapack-netlib/SRC/chetrd_he2hb.f index 904555c10..42e71e0b2 100644 --- a/lapack-netlib/SRC/chetrd_he2hb.f +++ b/lapack-netlib/SRC/chetrd_he2hb.f @@ -123,8 +123,8 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. *> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY). +*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -158,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetrd_he2hb * *> \par Further Details: * ===================== @@ -283,7 +285,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - EXTERNAL LSAME, ILAENV2STAGE + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV2STAGE, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -293,8 +296,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -313,7 +320,7 @@ CALL XERBLA( 'CHETRD_HE2HB', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN END IF * @@ -506,7 +513,7 @@ END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CHETRD_HE2HB diff --git a/lapack-netlib/SRC/chetrf.f b/lapack-netlib/SRC/chetrf.f index 484e76256..2836e30bc 100644 --- a/lapack-netlib/SRC/chetrf.f +++ b/lapack-netlib/SRC/chetrf.f @@ -107,7 +107,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> \endverbatim *> @@ -130,7 +130,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetrf * *> \par Further Details: * ===================== @@ -197,7 +197,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHETF2, CLAHEF, XERBLA @@ -227,8 +228,8 @@ * Determine the block size * NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = LWKOPT + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -346,7 +347,7 @@ END IF * 40 CONTINUE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF diff --git a/lapack-netlib/SRC/chetrf_aa.f b/lapack-netlib/SRC/chetrf_aa.f index 54567b8cd..51410a6ed 100644 --- a/lapack-netlib/SRC/chetrf_aa.f +++ b/lapack-netlib/SRC/chetrf_aa.f @@ -74,7 +74,7 @@ *> *> On exit, the tridiagonal matrix is stored in the diagonals *> and the subdiagonals of A just below (or above) the diagonals, -*> and L is stored below (or above) the subdiaonals, when UPLO +*> and L is stored below (or above) the subdiagonals, when UPLO *> is 'L' (or 'U'). *> \endverbatim *> @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= 2*N. For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -125,10 +127,10 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -152,14 +154,15 @@ * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB COMPLEX ALPHA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLAHEF_AA, CGEMM, CCOPY, CSWAP, CSCAL, XERBLA @@ -178,19 +181,26 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.( 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -202,11 +212,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN A( 1, 1 ) = REAL( A( 1, 1 ) ) RETURN END IF @@ -459,7 +469,7 @@ END IF * 20 CONTINUE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF_AA diff --git a/lapack-netlib/SRC/chetrf_aa_2stage.f b/lapack-netlib/SRC/chetrf_aa_2stage.f index 20a9859e8..a79343753 100644 --- a/lapack-netlib/SRC/chetrf_aa_2stage.f +++ b/lapack-netlib/SRC/chetrf_aa_2stage.f @@ -75,7 +75,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> @@ -87,14 +87,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX array, dimension (LTB) +*> TB is COMPLEX array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX workspace of size LWORK +*> WORK is COMPLEX workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used +*> to select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -188,7 +188,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. @@ -213,9 +214,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -229,10 +230,10 @@ NB = ILAENV( 1, 'CHETRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = SROUNDUP_LWORK( MAX( 1, (3*NB+1)*N ) ) END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = SROUNDUP_LWORK( MAX( 1, N*NB ) ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -241,7 +242,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * diff --git a/lapack-netlib/SRC/chetrf_rk.f b/lapack-netlib/SRC/chetrf_rk.f index 3f60f4370..a13c740e3 100644 --- a/lapack-netlib/SRC/chetrf_rk.f +++ b/lapack-netlib/SRC/chetrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension ( MAX(1,LWORK) ). +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. *> @@ -229,7 +229,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetrf_rk * *> \par Further Details: * ===================== @@ -280,7 +280,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLAHEF_RK, CHETF2_RK, CSWAP, XERBLA @@ -310,8 +311,8 @@ * Determine the block size * NB = ILAENV( 1, 'CHETRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = LWKOPT + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -487,7 +488,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF_RK diff --git a/lapack-netlib/SRC/chetrf_rook.f b/lapack-netlib/SRC/chetrf_rook.f index 805e0f4cb..df0323520 100644 --- a/lapack-netlib/SRC/chetrf_rook.f +++ b/lapack-netlib/SRC/chetrf_rook.f @@ -122,7 +122,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -150,7 +150,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetrf_rook * *> \par Further Details: * ===================== @@ -232,7 +232,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLAHEF_ROOK, CHETF2_ROOK, XERBLA @@ -263,7 +264,7 @@ * NB = ILAENV( 1, 'CHETRF_ROOK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -386,7 +387,7 @@ END IF * 40 CONTINUE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF_ROOK diff --git a/lapack-netlib/SRC/chetri2.f b/lapack-netlib/SRC/chetri2.f index 2865a6440..f15065ae7 100644 --- a/lapack-netlib/SRC/chetri2.f +++ b/lapack-netlib/SRC/chetri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (N+NB+1)*(NB+3) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE CHETRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -147,7 +147,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHETRI2X, CHETRI, XERBLA @@ -159,9 +160,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +178,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = SROUNDUP_LWORK( MINSIZE ) RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - IF( NBMAX .GE. N ) THEN + IF( NBMAX.GE.N ) THEN CALL CHETRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL CHETRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of CHETRI2 diff --git a/lapack-netlib/SRC/chetri_3.f b/lapack-netlib/SRC/chetri_3.f index cc2318b22..ccfce5070 100644 --- a/lapack-netlib/SRC/chetri_3.f +++ b/lapack-netlib/SRC/chetri_3.f @@ -119,16 +119,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (N+NB+1)*(NB+3). +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= (N+NB+1)*(NB+3). +*> The length of WORK. +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> -*> If LDWORK = -1, then a workspace query is assumed; +*> If LWORK = -1, then a workspace query is assumed; *> the routine only calculates the optimal size of the optimal *> size of the WORK array, returns this value as the first *> entry of the WORK array, and no error message related to @@ -152,7 +153,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetri_3 * *> \par Contributors: * ================== @@ -190,7 +191,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHETRI_3X, XERBLA @@ -208,8 +210,13 @@ * * Determine the block size * - NB = MAX( 1, ILAENV( 1, 'CHETRI_3', UPLO, N, -1, -1, -1 ) ) - LWKOPT = ( N+NB+1 ) * ( NB+3 ) + IF( N.EQ.0 ) THEN + LWKOPT = 1 + ELSE + NB = MAX( 1, ILAENV( 1, 'CHETRI_3', UPLO, N, -1, -1, -1 ) ) + LWKOPT = ( N+NB+1 ) * ( NB+3 ) + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 @@ -217,7 +224,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN INFO = -8 END IF * @@ -225,7 +232,6 @@ CALL XERBLA( 'CHETRI_3', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = LWKOPT RETURN END IF * @@ -236,7 +242,7 @@ * CALL CHETRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO ) * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chetrs_aa.f b/lapack-netlib/SRC/chetrs_aa.f index 2546adb2d..07179ab92 100644 --- a/lapack-netlib/SRC/chetrs_aa.f +++ b/lapack-netlib/SRC/chetrs_aa.f @@ -105,7 +105,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -123,7 +129,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetrs_aa * * ===================================================================== SUBROUTINE CHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, @@ -151,23 +157,30 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLACPY, CLACGV, CGTSV, CSWAP, CTRSM, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -178,21 +191,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKMIN ) RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN diff --git a/lapack-netlib/SRC/chpevd.f b/lapack-netlib/SRC/chpevd.f index c44462394..2449783a2 100644 --- a/lapack-netlib/SRC/chpevd.f +++ b/lapack-netlib/SRC/chpevd.f @@ -41,12 +41,6 @@ *> a complex Hermitian matrix A in packed storage. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -192,7 +186,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHEReigen +*> \ingroup hpevd * * ===================================================================== SUBROUTINE CHPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK, @@ -229,8 +223,8 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL CLANHP, SLAMCH - EXTERNAL LSAME, CLANHP, SLAMCH + REAL CLANHP, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, CLANHP, SLAMCH, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHPTRD, CSSCAL, CSTEDC, CUPMTR, SSCAL, SSTERF, @@ -274,7 +268,7 @@ LIWMIN = 1 END IF END IF - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * @@ -365,7 +359,7 @@ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN RETURN diff --git a/lapack-netlib/SRC/chpgv.f b/lapack-netlib/SRC/chpgv.f index 417f10121..660724e05 100644 --- a/lapack-netlib/SRC/chpgv.f +++ b/lapack-netlib/SRC/chpgv.f @@ -144,7 +144,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not convergeto zero; *> > N: if INFO = N + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/chpgvd.f b/lapack-netlib/SRC/chpgvd.f index 65d08b783..57ac4fc72 100644 --- a/lapack-netlib/SRC/chpgvd.f +++ b/lapack-netlib/SRC/chpgvd.f @@ -44,12 +44,6 @@ *> positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -205,7 +199,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not convergeto zero; *> > N: if INFO = N + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim @@ -218,7 +212,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHEReigen +*> \ingroup hpgvd * *> \par Contributors: * ================== @@ -252,7 +246,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHPEVD, CHPGST, CPPTRF, CTPMV, CTPSV, XERBLA @@ -298,7 +293,7 @@ END IF END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -380,7 +375,7 @@ END IF END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN RETURN diff --git a/lapack-netlib/SRC/chpgvx.f b/lapack-netlib/SRC/chpgvx.f index 711daf55f..2646800cc 100644 --- a/lapack-netlib/SRC/chpgvx.f +++ b/lapack-netlib/SRC/chpgvx.f @@ -250,7 +250,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/chseqr.f b/lapack-netlib/SRC/chseqr.f index 007f72f59..56ff01fc6 100644 --- a/lapack-netlib/SRC/chseqr.f +++ b/lapack-netlib/SRC/chseqr.f @@ -216,7 +216,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup hseqr * *> \par Contributors: * ================== @@ -343,7 +343,8 @@ * .. External Functions .. INTEGER ILAENV LOGICAL LSAME - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CCOPY, CLACPY, CLAHQR, CLAQR0, CLASET, XERBLA diff --git a/lapack-netlib/SRC/cla_gbrfsx_extended.f b/lapack-netlib/SRC/cla_gbrfsx_extended.f index 6d43c8325..e8b96fd6e 100644 --- a/lapack-netlib/SRC/cla_gbrfsx_extended.f +++ b/lapack-netlib/SRC/cla_gbrfsx_extended.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE CLA_GBRFSX_EXTENDED ( PREC_TYPE, TRANS_TYPE, N, KL, KU, +* SUBROUTINE CLA_GBRFSX_EXTENDED( PREC_TYPE, TRANS_TYPE, N, KL, KU, * NRHS, AB, LDAB, AFB, LDAFB, IPIV, * COLEQU, C, B, LDB, Y, LDY, * BERR_OUT, N_NORMS, ERR_BNDS_NORM, @@ -400,7 +400,7 @@ *> \ingroup complexGBcomputational * * ===================================================================== - SUBROUTINE CLA_GBRFSX_EXTENDED ( PREC_TYPE, TRANS_TYPE, N, KL, KU, + SUBROUTINE CLA_GBRFSX_EXTENDED( PREC_TYPE, TRANS_TYPE, N, KL, KU, $ NRHS, AB, LDAB, AFB, LDAFB, IPIV, $ COLEQU, C, B, LDB, Y, LDY, $ BERR_OUT, N_NORMS, ERR_BNDS_NORM, @@ -651,7 +651,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL CAXPY( N, (1.0E+0,0.0E+0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/cla_gerfsx_extended.f b/lapack-netlib/SRC/cla_gerfsx_extended.f index 4b1031101..e524db5ad 100644 --- a/lapack-netlib/SRC/cla_gerfsx_extended.f +++ b/lapack-netlib/SRC/cla_gerfsx_extended.f @@ -637,7 +637,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL CAXPY( N, (1.0E+0,0.0E+0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/cla_herfsx_extended.f b/lapack-netlib/SRC/cla_herfsx_extended.f index 6d007ef58..19f845692 100644 --- a/lapack-netlib/SRC/cla_herfsx_extended.f +++ b/lapack-netlib/SRC/cla_herfsx_extended.f @@ -654,7 +654,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL CAXPY( N, CMPLX(1.0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/cla_porfsx_extended.f b/lapack-netlib/SRC/cla_porfsx_extended.f index 9ced9b1b9..2dafecb35 100644 --- a/lapack-netlib/SRC/cla_porfsx_extended.f +++ b/lapack-netlib/SRC/cla_porfsx_extended.f @@ -625,7 +625,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL CAXPY( N, CMPLX(1.0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/cla_porpvgrw.f b/lapack-netlib/SRC/cla_porpvgrw.f index 1eb706d1a..78cd19da9 100644 --- a/lapack-netlib/SRC/cla_porpvgrw.f +++ b/lapack-netlib/SRC/cla_porpvgrw.f @@ -140,9 +140,9 @@ * .. Executable Statements .. UPPER = LSAME( 'Upper', UPLO ) * -* SPOTRF will have factored only the NCOLSxNCOLS leading minor, so -* we restrict the growth search to that minor and use only the first -* 2*NCOLS workspace entries. +* SPOTRF will have factored only the NCOLSxNCOLS leading submatrix, +* so we restrict the growth search to that submatrix and use only +* the first 2*NCOLS workspace entries. * RPVGRW = 1.0 DO I = 1, 2*NCOLS diff --git a/lapack-netlib/SRC/cla_syrfsx_extended.f b/lapack-netlib/SRC/cla_syrfsx_extended.f index 4fe538a98..95f969731 100644 --- a/lapack-netlib/SRC/cla_syrfsx_extended.f +++ b/lapack-netlib/SRC/cla_syrfsx_extended.f @@ -654,7 +654,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL CAXPY( N, CMPLX(1.0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/claed7.f b/lapack-netlib/SRC/claed7.f index 9d2f97141..72f1417d4 100644 --- a/lapack-netlib/SRC/claed7.f +++ b/lapack-netlib/SRC/claed7.f @@ -363,7 +363,7 @@ RETURN END IF * -* Prepare the INDXQ sorting premutation. +* Prepare the INDXQ sorting permutation. * N1 = K N2 = N - K diff --git a/lapack-netlib/SRC/claed8.f b/lapack-netlib/SRC/claed8.f index c15a0365f..1600087ab 100644 --- a/lapack-netlib/SRC/claed8.f +++ b/lapack-netlib/SRC/claed8.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE CLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMDA, +* SUBROUTINE CLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMBDA, * Q2, LDQ2, W, INDXP, INDX, INDXQ, PERM, GIVPTR, * GIVCOL, GIVNUM, INFO ) * @@ -29,7 +29,7 @@ * .. Array Arguments .. * INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), * $ INDXQ( * ), PERM( * ) -* REAL D( * ), DLAMDA( * ), GIVNUM( 2, * ), W( * ), +* REAL D( * ), DLAMBDA( * ), GIVNUM( 2, * ), W( * ), * $ Z( * ) * COMPLEX Q( LDQ, * ), Q2( LDQ2, * ) * .. @@ -122,9 +122,9 @@ *> destroyed during the updating process. *> \endverbatim *> -*> \param[out] DLAMDA +*> \param[out] DLAMBDA *> \verbatim -*> DLAMDA is REAL array, dimension (N) +*> DLAMBDA is REAL array, dimension (N) *> Contains a copy of the first K eigenvalues which will be used *> by SLAED3 to form the secular equation. *> \endverbatim @@ -222,7 +222,7 @@ *> \ingroup complexOTHERcomputational * * ===================================================================== - SUBROUTINE CLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMDA, + SUBROUTINE CLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMBDA, $ Q2, LDQ2, W, INDXP, INDX, INDXQ, PERM, GIVPTR, $ GIVCOL, GIVNUM, INFO ) * @@ -237,7 +237,7 @@ * .. Array Arguments .. INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), $ INDXQ( * ), PERM( * ) - REAL D( * ), DLAMDA( * ), GIVNUM( 2, * ), W( * ), + REAL D( * ), DLAMBDA( * ), GIVNUM( 2, * ), W( * ), $ Z( * ) COMPLEX Q( LDQ, * ), Q2( LDQ2, * ) * .. @@ -322,14 +322,14 @@ INDXQ( I ) = INDXQ( I ) + CUTPNT 20 CONTINUE DO 30 I = 1, N - DLAMDA( I ) = D( INDXQ( I ) ) + DLAMBDA( I ) = D( INDXQ( I ) ) W( I ) = Z( INDXQ( I ) ) 30 CONTINUE I = 1 J = CUTPNT + 1 - CALL SLAMRG( N1, N2, DLAMDA, 1, 1, INDX ) + CALL SLAMRG( N1, N2, DLAMBDA, 1, 1, INDX ) DO 40 I = 1, N - D( I ) = DLAMDA( INDX( I ) ) + D( I ) = DLAMBDA( INDX( I ) ) Z( I ) = W( INDX( I ) ) 40 CONTINUE * @@ -438,7 +438,7 @@ ELSE K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM JLAM = J END IF @@ -450,19 +450,19 @@ * K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM * 100 CONTINUE * -* Sort the eigenvalues and corresponding eigenvectors into DLAMDA +* Sort the eigenvalues and corresponding eigenvectors into DLAMBDA * and Q2 respectively. The eigenvalues/vectors which were not -* deflated go into the first K slots of DLAMDA and Q2 respectively, +* deflated go into the first K slots of DLAMBDA and Q2 respectively, * while those which were deflated go into the last N - K slots. * DO 110 J = 1, N JP = INDXP( J ) - DLAMDA( J ) = D( JP ) + DLAMBDA( J ) = D( JP ) PERM( J ) = INDXQ( INDX( JP ) ) CALL CCOPY( QSIZ, Q( 1, PERM( J ) ), 1, Q2( 1, J ), 1 ) 110 CONTINUE @@ -471,7 +471,7 @@ * into the last N - K slots of D and Q respectively. * IF( K.LT.N ) THEN - CALL SCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 ) + CALL SCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 ) CALL CLACPY( 'A', QSIZ, N-K, Q2( 1, K+1 ), LDQ2, Q( 1, K+1 ), $ LDQ ) END IF diff --git a/lapack-netlib/SRC/clals0.f b/lapack-netlib/SRC/clals0.f index e981fc36f..0b545d5d7 100644 --- a/lapack-netlib/SRC/clals0.f +++ b/lapack-netlib/SRC/clals0.f @@ -392,6 +392,11 @@ $ ( POLES( I, 2 ).EQ.ZERO ) ) THEN RWORK( I ) = ZERO ELSE +* +* Use calls to the subroutine SLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent +* optimizing compilers from doing x+(y+z). +* RWORK( I ) = POLES( I, 2 )*Z( I ) / $ ( SLAMC3( POLES( I, 2 ), DSIGJ )- $ DIFLJ ) / ( POLES( I, 2 )+DJ ) @@ -470,6 +475,11 @@ IF( Z( J ).EQ.ZERO ) THEN RWORK( I ) = ZERO ELSE +* +* Use calls to the subroutine SLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent optimizing +* compilers from doing x+(y+z). +* RWORK( I ) = Z( J ) / ( SLAMC3( DSIGJ, -POLES( I+1, $ 2 ) )-DIFR( I, 1 ) ) / $ ( DSIGJ+POLES( I, 1 ) ) / DIFR( I, 2 ) diff --git a/lapack-netlib/SRC/clalsa.f b/lapack-netlib/SRC/clalsa.f index 06883ab20..fceb380f5 100644 --- a/lapack-netlib/SRC/clalsa.f +++ b/lapack-netlib/SRC/clalsa.f @@ -42,9 +42,9 @@ *> *> \verbatim *> -*> CLALSA is an itermediate step in solving the least squares problem +*> CLALSA is an intermediate step in solving the least squares problem *> by computing the SVD of the coefficient matrix in compact form (The -*> singular vectors are computed as products of simple orthorgonal +*> singular vectors are computed as products of simple orthogonal *> matrices.). *> *> If ICOMPQ = 0, CLALSA applies the inverse of the left singular vector diff --git a/lapack-netlib/SRC/clalsd.f b/lapack-netlib/SRC/clalsd.f index a2da9a925..bdd6b31c5 100644 --- a/lapack-netlib/SRC/clalsd.f +++ b/lapack-netlib/SRC/clalsd.f @@ -48,12 +48,6 @@ *> problem; in this case a minimum norm solution is returned. *> The actual singular values are returned in D in ascending order. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/clamswlq.f b/lapack-netlib/SRC/clamswlq.f index 1606cc611..8f474a3ab 100644 --- a/lapack-netlib/SRC/clamswlq.f +++ b/lapack-netlib/SRC/clamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -189,92 +192,104 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamswlq +*> * ===================================================================== SUBROUTINE CLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK +* .. * .. External Subroutines .. - EXTERNAL CTPMLQT, CGEMLQT, XERBLA + EXTERNAL CTPMLQT, CGEMLQT, XERBLA * .. * .. Executable Statements .. * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLAMSWLQ', -INFO ) - WORK(1) = LW RETURN - ELSE IF (LQUERY) THEN - WORK(1) = LW + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * IF((NB.LE.K).OR.(NB.GE.MAX(M,N,K))) THEN CALL CGEMLQT( SIDE, TRANS, M, N, K, MB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN END IF * @@ -401,7 +416,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLAMSWLQ diff --git a/lapack-netlib/SRC/clamtsqr.f b/lapack-netlib/SRC/clamtsqr.f index 5677420ac..13625087f 100644 --- a/lapack-netlib/SRC/clamtsqr.f +++ b/lapack-netlib/SRC/clamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). *> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -191,46 +193,51 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamtsqr +*> * ===================================================================== SUBROUTINE CLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK +* .. * .. External Subroutines .. - EXTERNAL CGEMQRT, CTPMQRT, XERBLA + EXTERNAL CGEMQRT, CTPMQRT, XERBLA * .. * .. Executable Statements .. * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -238,11 +245,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -253,38 +266,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = LW + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL CGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -409,7 +422,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLAMTSQR diff --git a/lapack-netlib/SRC/claqp2rk.c b/lapack-netlib/SRC/claqp2rk.c new file mode 100644 index 000000000..4184c5927 --- /dev/null +++ b/lapack-netlib/SRC/claqp2rk.c @@ -0,0 +1,943 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + *info = *n + kk - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL >= ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL >= ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + +/* Set K, the number of factorized columns. */ + + *k = kk - 1; + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. */ + + i__2 = minmnfact; + for (j = kk; j <= i__2; ++j) { + i__3 = j; + tau[i__3].r = 0.f, tau[i__3].i = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,KK:N): */ +/* 1) swap the KK-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) copy the KK-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than KK in the next loop step.) */ +/* 3) Save the pivot interchange with the indices relative to the */ +/* the original matrix A, not the block A(1:M,1:N). */ + + if (kp != kk) { + cswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1); + vn1[kp] = vn1[kk]; + vn2[kp] = vn2[kk]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[kk]; + jpiv[kk] = itemp; + } + +/* Generate elementary reflector H(KK) using the column A(I:M,KK), */ +/* if the column has more than one element, otherwise */ +/* the elementary reflector would be an identity matrix, */ +/* and TAU(KK) = CZERO. */ + + if (i__ < *m) { + i__2 = *m - i__ + 1; + clarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], & + c__1, &tau[kk]); + } else { + i__2 = kk; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + +/* Check if TAU(KK) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(KK) for Inf, */ +/* since CLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by CLARFG can contain Inf, which requires */ +/* TAU(KK) to contain NaN. Therefore, this case of generating Inf */ +/* by CLARFG is covered by checking TAU(KK) for NaN. */ + + i__2 = kk; + r__1 = tau[i__2].r; + if (sisnan_(&r__1)) { + i__2 = kk; + taunan = tau[i__2].r; + } else /* if(complicated condition) */ { + r__1 = r_imag(&tau[kk]); + if (sisnan_(&r__1)) { + taunan = r_imag(&tau[kk]); + } else { + taunan = 0.f; + } + } + + if (sisnan_(&taunan)) { + *k = kk - 1; + *info = kk; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = taunan; + *relmaxc2nrmk = taunan; + +/* Array TAU(KK:MINMNFACT) is not set and contains */ +/* undefined elements, except the first element TAU(KK) = NaN. */ + + return 0; + } + +/* Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left. */ +/* ( If M >= N, then at KK = N there is no residual matrix, */ +/* i.e. no columns of A to update, only columns of B. */ +/* If M < N, then at KK = M-IOFFSET, I = M and we have a */ +/* one-row residual matrix in A and the elementary */ +/* reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update */ +/* is needed for the residual matrix in A and the */ +/* right-hand-side-matrix in B. */ +/* Therefore, we update only if */ +/* KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */ +/* condition is satisfied, not only KK < N+NRHS ) */ + + if (kk < minmnupdt) { + i__2 = i__ + kk * a_dim1; + aikk.r = a[i__2].r, aikk.i = a[i__2].i; + i__2 = i__ + kk * a_dim1; + a[i__2].r = 1.f, a[i__2].i = 0.f; + i__2 = *m - i__ + 1; + i__3 = *n + *nrhs - kk; + r_cnjg(&q__1, &tau[kk]); + clarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &q__1, + &a[i__ + (kk + 1) * a_dim1], lda, &work[1]); + i__2 = i__ + kk * a_dim1; + a[i__2].r = aikk.r, a[i__2].i = aikk.i; + } + + if (kk < minmnfact) { + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */ +/* when KK < f2cmin(M-IOFFSET, N). */ + + i__2 = *n; + for (j = kk + 1; j <= i__2; ++j) { + if (vn1[j] != 0.f) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + +/* Computing 2nd power */ + r__1 = c_abs(&a[i__ + j * a_dim1]) / vn1[j]; + temp = 1.f - r__1 * r__1; + temp = f2cmax(temp,0.f); +/* Computing 2nd power */ + r__1 = vn1[j] / vn2[j]; + temp2 = temp * (r__1 * r__1); + if (temp2 <= tol3z) { + +/* Compute the column 2-norm for the partial */ +/* column A(I+1:M,J) by explicitly computing it, */ +/* and store it in both partial 2-norm vector VN1 */ +/* and exact column 2-norm vector VN2. */ + + i__3 = *m - i__; + vn1[j] = scnrm2_(&i__3, &a[i__ + 1 + j * a_dim1], & + c__1); + vn2[j] = vn1[j]; + + } else { + +/* Update the column 2-norm for the partial */ +/* column A(I+1:M,J) by removing one */ +/* element A(I,J) and store it in partial */ +/* 2-norm vector VN1. */ + + vn1[j] *= sqrt(temp); + + } + } + } + + } + +/* End factorization loop */ + + } + +/* If we reached this point, all colunms have been factorized, */ +/* i.e. no condition was triggered to exit the routine. */ +/* Set the number of factorized columns. */ + + *k = *kmax; + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */ +/* we return. */ + + if (*k < minmnfact) { + + i__1 = *n - *k; + jmaxc2nrm = *k + isamax_(&i__1, &vn1[*k + 1], &c__1); + *maxc2nrmk = vn1[jmaxc2nrm]; + + if (*k == 0) { + *relmaxc2nrmk = 1.f; + } else { + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + } + + } else { + *maxc2nrmk = 0.f; + *relmaxc2nrmk = 0.f; + } + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, set TAUs corresponding to the columns that were */ +/* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO. */ + + i__1 = minmnfact; + for (j = *k + 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + + return 0; + +/* End of CLAQP2RK */ + +} /* claqp2rk_ */ + diff --git a/lapack-netlib/SRC/claqp2rk.f b/lapack-netlib/SRC/claqp2rk.f new file mode 100644 index 000000000..6b1db085a --- /dev/null +++ b/lapack-netlib/SRC/claqp2rk.f @@ -0,0 +1,726 @@ +*> \brief \b CLAQP2RK computes truncated QR factorization with column pivoting of a complex matrix block using Level 2 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CLAQP2RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, +* $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, +* $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, +* $ INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS +* REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER JPIV( * ) +* REAL VN1( * ), VN2( * ) +* COMPLEX A( LDA, * ), TAU( * ), WORK( * ) +* $ +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLAQP2RK computes a truncated (rank K) or full rank Householder QR +*> factorization with column pivoting of the complex matrix +*> block A(IOFFSET+1:M,1:N) as +*> +*> A * P(K) = Q(K) * R(K). +*> +*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N) +*> is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides matrix block B +*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**H * B. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M-IOFFSET,N), then this stopping +*> criterion is not used, factorize columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL, cannot be NaN. +*> +*> The second factorization stopping criterion. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is REAL, cannot be NaN. +*> +*> The third factorization stopping criterion. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine CGEQP3RK. 1 <= KP1 <= N_orig_mat. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is REAL +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine CGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:K) below +*> the diagonal together with the array TAU represent +*> the unitary matrix Q(K) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:K) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS). +*> The left part A(IOFFSET+1:M,K+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(K)**H. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is REAL +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is REAL +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is REAL array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is REAL array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (N-1) +*> Used in CLARF subroutine to apply an elementary +*> reflector from the left. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp2rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, + $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, + $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, + $ INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS + REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. + INTEGER JPIV( * ) + REAL VN1( * ), VN2( * ) + COMPLEX A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + COMPLEX CZERO, CONE + PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ), + $ CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT, + $ MINMNUPDT + REAL HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z + COMPLEX AIKK +* .. +* .. External Subroutines .. + EXTERNAL CLARF, CLARFG, CSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, REAL, CONJG, IMAG, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL SISNAN + INTEGER ISAMAX + REAL SLAMCH, SCNRM2 + EXTERNAL SISNAN, SLAMCH, ISAMAX, SCNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* +* MINMNUPDT is the smallest dimension +* of the subarray A(IOFFSET+1:M,1:N+NRHS) to be udated, which +* contains the submatrices A(IOFFSET+1:M,1:N) and +* B(IOFFSET+1:M,1:NRHS) as column blocks. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + KMAX = MIN( KMAX, MINMNFACT ) + TOL3Z = SQRT( SLAMCH( 'Epsilon' ) ) + HUGEVAL = SLAMCH( 'Overflow' ) +* +* Compute the factorization, KK is the lomn loop index. +* + DO KK = 1, KMAX +* + I = IOFFSET + KK +* + IF( I.EQ.1 ) THEN +* +* ============================================================ +* +* We are at the first column of the original whole matrix A, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + KP = KP1 +* +* ============================================================ +* + ELSE +* +* ============================================================ +* +* Determine the pivot column in KK-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). +* + KP = ( KK-1 ) + ISAMAX( N-KK+1, VN1( KK ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,KK:N) in step KK. +* RELMAXC2NRMK will be computed later, after somecondition +* checks on MAXC2NRMK. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains NaN, and set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( SISNAN( MAXC2NRMK ) ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + INFO = K + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* Array TAU(K+1:MINMNFACT) is not set and contains +* undefined elements. +* + RETURN + END IF +* +* ============================================================ +* +* Quick return, if the submatrix A(I:M,KK:N) is +* a zero matrix. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + RELMAXC2NRMK = ZERO +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = CZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + KK - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third stopping criteria. +* NOTE: There is no need to test for ABSTOL >= ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL >= ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. + + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* +* Set K, the number of factorized columns. +* + K = KK - 1 +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = CZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,KK:N): +* 1) swap the KK-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) copy the KK-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than KK in the next loop step.) +* 3) Save the pivot interchange with the indices relative to the +* the original matrix A, not the block A(1:M,1:N). +* + IF( KP.NE.KK ) THEN + CALL CSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 ) + VN1( KP ) = VN1( KK ) + VN2( KP ) = VN2( KK ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( KK ) + JPIV( KK ) = ITEMP + END IF +* +* Generate elementary reflector H(KK) using the column A(I:M,KK), +* if the column has more than one element, otherwise +* the elementary reflector would be an identity matrix, +* and TAU(KK) = CZERO. +* + IF( I.LT.M ) THEN + CALL CLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1, + $ TAU( KK ) ) + ELSE + TAU( KK ) = CZERO + END IF +* +* Check if TAU(KK) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(KK) for Inf, +* since CLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by CLARFG can contain Inf, which requires +* TAU(KK) to contain NaN. Therefore, this case of generating Inf +* by CLARFG is covered by checking TAU(KK) for NaN. +* + IF( SISNAN( REAL( TAU(KK) ) ) ) THEN + TAUNAN = REAL( TAU(KK) ) + ELSE IF( SISNAN( IMAG( TAU(KK) ) ) ) THEN + TAUNAN = IMAG( TAU(KK) ) + ELSE + TAUNAN = ZERO + END IF +* + IF( SISNAN( TAUNAN ) ) THEN + K = KK - 1 + INFO = KK +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAUNAN + RELMAXC2NRMK = TAUNAN +* +* Array TAU(KK:MINMNFACT) is not set and contains +* undefined elements, except the first element TAU(KK) = NaN. +* + RETURN + END IF +* +* Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left. +* ( If M >= N, then at KK = N there is no residual matrix, +* i.e. no columns of A to update, only columns of B. +* If M < N, then at KK = M-IOFFSET, I = M and we have a +* one-row residual matrix in A and the elementary +* reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update +* is needed for the residual matrix in A and the +* right-hand-side-matrix in B. +* Therefore, we update only if +* KK < MINMNUPDT = min(M-IOFFSET, N+NRHS) +* condition is satisfied, not only KK < N+NRHS ) +* + IF( KK.LT.MINMNUPDT ) THEN + AIKK = A( I, KK ) + A( I, KK ) = CONE + CALL CLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1, + $ CONJG( TAU( KK ) ), A( I, KK+1 ), LDA, + $ WORK( 1 ) ) + A( I, KK ) = AIKK + END IF +* + IF( KK.LT.MINMNFACT ) THEN +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. +* when KK < min(M-IOFFSET, N). +* + DO J = KK + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2 + TEMP = MAX( TEMP, ZERO ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2 .LE. TOL3Z ) THEN +* +* Compute the column 2-norm for the partial +* column A(I+1:M,J) by explicitly computing it, +* and store it in both partial 2-norm vector VN1 +* and exact column 2-norm vector VN2. +* + VN1( J ) = SCNRM2( M-I, A( I+1, J ), 1 ) + VN2( J ) = VN1( J ) +* + ELSE +* +* Update the column 2-norm for the partial +* column A(I+1:M,J) by removing one +* element A(I,J) and store it in partial +* 2-norm vector VN1. +* + VN1( J ) = VN1( J )*SQRT( TEMP ) +* + END IF + END IF + END DO +* + END IF +* +* End factorization loop +* + END DO +* +* If we reached this point, all colunms have been factorized, +* i.e. no condition was triggered to exit the routine. +* Set the number of factorized columns. +* + K = KMAX +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before +* we return. +* + IF( K.LT.MINMNFACT ) THEN +* + JMAXC2NRM = K + ISAMAX( N-K, VN1( K+1 ), 1 ) + MAXC2NRMK = VN1( JMAXC2NRM ) +* + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + END IF +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, set TAUs corresponding to the columns that were +* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO. +* + DO J = K + 1, MINMNFACT + TAU( J ) = CZERO + END DO +* + RETURN +* +* End of CLAQP2RK +* + END diff --git a/lapack-netlib/SRC/claqp3rk.c b/lapack-netlib/SRC/claqp3rk.c new file mode 100644 index 000000000..ca305fab7 --- /dev/null +++ b/lapack-netlib/SRC/claqp3rk.c @@ -0,0 +1,1152 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, + kb, &q__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* Quick return, if the submatrix A(I:M,K:N) is */ +/* a zero matrix. We need to check it only if the column index */ +/* (same as row index) is larger than 1, since the condition */ +/* for the whole original matrix A_orig is checked in the main */ +/* routine. */ + + if (*maxc2nrmk == 0.f) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *relmaxc2nrmk = 0.f; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix is zero and we stop the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, + kb, &q__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* Check if the submatrix A(I:M,K:N) contains Inf, */ +/* set INFO parameter to the column number, where */ +/* the first Inf is found plus N, and continue */ +/* the computation. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + if (*info == 0 && *maxc2nrmk > myhugeval) { + *info = *n + k - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third tolerance stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL.GE.ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig; */ + + *kb = k - 1; + if__ = i__ - 1; + +/* Apply the block reflector to the residual of the */ +/* matrix A and the residual of the right hand sides B, if */ +/* the residual matrix and and/or the residual of the right */ +/* hand sides exist, i.e. if the submatrix */ +/* A(I+1:M,KB+1:N+NRHS) exists. This occurs when */ +/* KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &i__1, & + i__2, kb, &q__1, &a[if__ + 1 + a_dim1], lda, &f[* + kb + 1 + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*kb + + 1) * a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,K:N): */ +/* 1) swap the K-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */ +/* 3) copy the K-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. (Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than K in the next loop step.) */ +/* 4) Save the pivot interchange with the indices relative to the */ +/* the original matrix A_orig, not the block A(1:M,1:N). */ + + if (kp != k) { + cswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + cswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf); + vn1[kp] = vn1[k]; + vn2[kp] = vn2[k]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[k]; + jpiv[k] = itemp; + } + +/* Apply previous Householder reflectors to column K: */ +/* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H. */ + + if (k > 1) { + i__1 = k - 1; + for (j = 1; j <= i__1; ++j) { + i__2 = k + j * f_dim1; + r_cnjg(&q__1, &f[k + j * f_dim1]); + f[i__2].r = q__1.r, f[i__2].i = q__1.i; + } + i__1 = *m - i__ + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = 0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[i__ + a_dim1], lda, + &f[k + f_dim1], ldf, &c_b2, &a[i__ + k * a_dim1], &c__1); + i__1 = k - 1; + for (j = 1; j <= i__1; ++j) { + i__2 = k + j * f_dim1; + r_cnjg(&q__1, &f[k + j * f_dim1]); + f[i__2].r = q__1.r, f[i__2].i = q__1.i; + } + } + +/* Generate elementary reflector H(k) using the column A(I:M,K). */ + + if (i__ < *m) { + i__1 = *m - i__ + 1; + clarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], & + c__1, &tau[k]); + } else { + i__1 = k; + tau[i__1].r = 0.f, tau[i__1].i = 0.f; + } + +/* Check if TAU(K) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(K) for Inf, */ +/* since CLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by CLARFG can contain Inf, which requires */ +/* TAU(K) to contain NaN. Therefore, this case of generating Inf */ +/* by CLARFG is covered by checking TAU(K) for NaN. */ + + i__1 = k; + r__1 = tau[i__1].r; + if (sisnan_(&r__1)) { + i__1 = k; + taunan = tau[i__1].r; + } else /* if(complicated condition) */ { + r__1 = r_imag(&tau[k]); + if (sisnan_(&r__1)) { + taunan = r_imag(&tau[k]); + } else { + taunan = 0.f; + } + } + + if (sisnan_(&taunan)) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *info = k; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = taunan; + *relmaxc2nrmk = taunan; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix contains NaN and we stop */ +/* the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, kb, + &q__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * a_dim1], + lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* =============================================================== */ + + i__1 = i__ + k * a_dim1; + aik.r = a[i__1].r, aik.i = a[i__1].i; + i__1 = i__ + k * a_dim1; + a[i__1].r = 1.f, a[i__1].i = 0.f; + +/* =============================================================== */ + +/* Compute the current K-th column of F: */ +/* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K). */ + + if (k < *n + *nrhs) { + i__1 = *m - i__ + 1; + i__2 = *n + *nrhs - k; + cgemv_("Conjugate transpose", &i__1, &i__2, &tau[k], &a[i__ + (k + + 1) * a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, & + f[k + 1 + k * f_dim1], &c__1); + } + +/* 2) Zero out elements above and on the diagonal of the */ +/* column K in matrix F, i.e elements F(1:K,K). */ + + i__1 = k; + for (j = 1; j <= i__1; ++j) { + i__2 = j + k * f_dim1; + f[i__2].r = 0.f, f[i__2].i = 0.f; + } + +/* 3) Incremental updating of the K-th column of F: */ +/* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H */ +/* * A(I:M,K). */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + i__3 = k; + q__1.r = -tau[i__3].r, q__1.i = -tau[i__3].i; + cgemv_("Conjugate Transpose", &i__1, &i__2, &q__1, &a[i__ + + a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, &auxv[1] + , &c__1); + + i__1 = *n + *nrhs; + i__2 = k - 1; + cgemv_("No transpose", &i__1, &i__2, &c_b2, &f[f_dim1 + 1], ldf, & + auxv[1], &c__1, &c_b2, &f[k * f_dim1 + 1], &c__1); + } + +/* =============================================================== */ + +/* Update the current I-th row of A: */ +/* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */ +/* - A(I,1:K)*F(K+1:N+NRHS,1:K)**H. */ + + if (k < *n + *nrhs) { + i__1 = *n + *nrhs - k; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &c__1, &i__1, &k, & + q__1, &a[i__ + a_dim1], lda, &f[k + 1 + f_dim1], ldf, & + c_b2, &a[i__ + (k + 1) * a_dim1], lda); + } + + i__1 = i__ + k * a_dim1; + a[i__1].r = aik.r, a[i__1].i = aik.i; + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */ +/* when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */ + + if (k < minmnfact) { + + i__1 = *n; + for (j = k + 1; j <= i__1; ++j) { + if (vn1[j] != 0.f) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + + temp = c_abs(&a[i__ + j * a_dim1]) / vn1[j]; +/* Computing MAX */ + r__1 = 0.f, r__2 = (temp + 1.f) * (1.f - temp); + temp = f2cmax(r__1,r__2); +/* Computing 2nd power */ + r__1 = vn1[j] / vn2[j]; + temp2 = temp * (r__1 * r__1); + if (temp2 <= tol3z) { + +/* At J-index, we have a difficult column for the */ +/* update of the 2-norm. Save the index of the previous */ +/* difficult column in IWORK(J-1). */ +/* NOTE: ILSTCC > 1, threfore we can use IWORK only */ +/* with N-1 elements, where the elements are */ +/* shifted by 1 to the left. */ + + iwork[j - 1] = lsticc; + +/* Set the index of the last difficult column LSTICC. */ + + lsticc = j; + + } else { + vn1[j] *= sqrt(temp); + } + } + } + + } + +/* End of while loop. */ + + } + +/* Now, afler the loop: */ +/* Set KB, the number of factorized columns in the block; */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig, IF = IOFFSET + KB. */ + + *kb = k; + if__ = i__; + +/* Apply the block reflector to the residual of the matrix A */ +/* and the residual of the right hand sides B, if the residual */ +/* matrix and and/or the residual of the right hand sides */ +/* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */ +/* This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &i__1, &i__2, kb, &q__1, + &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b2, + &a[if__ + 1 + (*kb + 1) * a_dim1], lda); + } + +/* Recompute the 2-norm of the difficult columns. */ +/* Loop over the index of the difficult columns from the largest */ +/* to the smallest index. */ + + while(lsticc > 0) { + +/* LSTICC is the index of the last difficult column is greater */ +/* than 1. */ +/* ITEMP is the index of the previous difficult column. */ + + itemp = iwork[lsticc - 1]; + +/* Compute the 2-norm explicilty for the last difficult column and */ +/* save it in the partial and exact 2-norm vectors VN1 and VN2. */ + +/* NOTE: The computation of VN1( LSTICC ) relies on the fact that */ +/* SCNRM2 does not fail on vectors with norm below the value of */ +/* SQRT(SLAMCH('S')) */ + + i__1 = *m - if__; + vn1[lsticc] = scnrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1); + vn2[lsticc] = vn1[lsticc]; + +/* Downdate the index of the last difficult column to */ +/* the index of the previous difficult column. */ + + lsticc = itemp; + + } + + return 0; + +/* End of CLAQP3RK */ + +} /* claqp3rk_ */ + diff --git a/lapack-netlib/SRC/claqp3rk.f b/lapack-netlib/SRC/claqp3rk.f new file mode 100644 index 000000000..3703bcbd6 --- /dev/null +++ b/lapack-netlib/SRC/claqp3rk.f @@ -0,0 +1,947 @@ +*> \brief \b CLAQP3RK computes a step of truncated QR factorization with column pivoting of a complex m-by-n matrix A using Level 3 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CLAQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, +* $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, +* $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) +* IMPLICIT NONE +* LOGICAL DONE +* INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, +* $ NB, NRHS +* REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* REAL VN1( * ), VN2( * ) +* COMPLEX*16 A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLAQP3RK computes a step of truncated QR factorization with column +*> pivoting of a complex M-by-N matrix A block A(IOFFSET+1:M,1:N) +*> by using Level 3 BLAS as +*> +*> A * P(KB) = Q(KB) * R(KB). +*> +*> The routine tries to factorize NB columns from A starting from +*> the row IOFFSET+1 and updates the residual matrix with BLAS 3 +*> xGEMM. The number of actually factorized columns is returned +*> is smaller than NB. +*> +*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides B matrix stored +*> in A(IOFFSET+1:M,1:N+1:N+NRHS) with Q(KB)**H * B. +*> +*> Cases when the number of factorized columns KB < NB: +*> +*> (1) In some cases, due to catastrophic cancellations, it cannot +*> factorize all NB columns and need to update the residual matrix. +*> Hence, the actual number of factorized columns in the block returned +*> in KB is smaller than NB. The logical DONE is returned as FALSE. +*> The factorization of the whole original matrix A_orig must proceed +*> with the next block. +*> +*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied, +*> the factorization of the whole original matrix A_orig is stopped, +*> the logical DONE is returned as TRUE. The number of factorized +*> columns which is smaller than NB is returned in KB. +*> +*> (3) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix is a zero matrix in some factorization +*> step KB, the factorization of the whole original matrix A_orig is +*> stopped, the logical DONE is returned as TRUE. The number of +*> factorized columns which is smaller than NB is returned in KB. +*> +*> (4) Whenever NaN is detected in the matrix A or in the array TAU, +*> the factorization of the whole original matrix A_orig is stopped, +*> the logical DONE is returned as TRUE. The number of factorized +*> columns which is smaller than NB is returned in KB. The INFO +*> parameter is set to the column index of the first NaN occurrence. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0 +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> Factorization block size, i.e the number of columns +*> to factorize in the matrix A. 0 <= NB +*> +*> If NB = 0, then the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on NB and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is REAL, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on NB and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine CGEQP3RK. 1 <= KP1 <= N_orig. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is REAL +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine CGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:KB) below +*> the diagonal together with the array TAU represent +*> the unitary matrix Q(KB) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:KB) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS). +*> The left part A(IOFFSET+1:M,KB+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(KB)**H. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] +*> \verbatim +*> DONE is LOGICAL +*> TRUE: a) if the factorization completed before processing +*> all min(M-IOFFSET,NB,N) columns due to ABSTOL +*> or RELTOL criterion, +*> b) if the factorization completed before processing +*> all min(M-IOFFSET,NB,N) columns due to the +*> residual matrix being a ZERO matrix. +*> c) when NaN was detected in the matrix A +*> or in the array TAU. +*> FALSE: otherwise. +*> \endverbatim +*> +*> \param[out] KB +*> \verbatim +*> KB is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= KB <= min(M-IOFFSET,NB,N). +*> +*> KB also represents the number of non-zero Householder +*> vectors. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is REAL +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank KB. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is REAL +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank KB) to the maximum column 2-norm of the +*> original matrix A_orig. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is REAL array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is REAL array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] AUXV +*> \verbatim +*> AUXV is COMPLEX array, dimension (NB) +*> Auxiliary vector. +*> \endverbatim +*> +*> \param[out] F +*> \verbatim +*> F is COMPLEX array, dimension (LDF,NB) +*> Matrix F**H = L*(Y**H)*A. +*> \endverbatim +*> +*> \param[in] LDF +*> \verbatim +*> LDF is INTEGER +*> The leading dimension of the array F. LDF >= max(1,N+NRHS). +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step KB+1 ( when KB columns +*> have been factorized ). +*> +*> On exit: +*> KB is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(KB+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=KB+1, TAU(KB+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the actorization +*> step KB+1 ( when KB columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp3rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, + $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + LOGICAL DONE + INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, + $ NB, NRHS + REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + REAL VN1( * ), VN2( * ) + COMPLEX A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + COMPLEX CZERO, CONE + PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ), + $ CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER ITEMP, J, K, MINMNFACT, MINMNUPDT, + $ LSTICC, KP, I, IF + REAL HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z + COMPLEX AIK +* .. +* .. External Subroutines .. + EXTERNAL CGEMM, CGEMV, CLARFG, CSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, REAL, CONJG, IMAG, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL SISNAN + INTEGER ISAMAX + REAL SLAMCH, SCNRM2 + EXTERNAL SISNAN, SLAMCH, ISAMAX, SCNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + NB = MIN( NB, MINMNFACT ) + TOL3Z = SQRT( SLAMCH( 'Epsilon' ) ) + HUGEVAL = SLAMCH( 'Overflow' ) +* +* Compute factorization in a while loop over NB columns, +* K is the column index in the block A(1:M,1:N). +* + K = 0 + LSTICC = 0 + DONE = .FALSE. +* + DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 ) + K = K + 1 + I = IOFFSET + K +* + IF( I.EQ.1 ) THEN +* +* We are at the first column of the original whole matrix A_orig, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + KP = KP1 +* + ELSE +* +* Determine the pivot column in K-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). +* + KP = ( K-1 ) + ISAMAX( N-K+1, VN1( K ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,K:N) in step K. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains NaN, set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( SISNAN( MAXC2NRMK ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = KB + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. + + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* Quick return, if the submatrix A(I:M,K:N) is +* a zero matrix. We need to check it only if the column index +* (same as row index) is larger than 1, since the condition +* for the whole original matrix A_orig is checked in the main +* routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + RELMAXC2NRMK = ZERO +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix is zero and we stop the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. +* + DO J = K, MINMNFACT + TAU( J ) = CZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + K - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third tolerance stopping criteria. +* NOTE: There is no need to test for ABSTOL.GE.ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig; +* + KB = K - 1 + IF = I - 1 +* +* Apply the block reflector to the residual of the +* matrix A and the residual of the right hand sides B, if +* the residual matrix and and/or the residual of the right +* hand sides exist, i.e. if the submatrix +* A(I+1:M,KB+1:N+NRHS) exists. This occurs when +* KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, N+NRHS-KB, KB,-CONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. +* + DO J = K, MINMNFACT + TAU( J ) = CZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,K:N): +* 1) swap the K-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) +* 3) copy the K-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. (Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than K in the next loop step.) +* 4) Save the pivot interchange with the indices relative to the +* the original matrix A_orig, not the block A(1:M,1:N). +* + IF( KP.NE.K ) THEN + CALL CSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 ) + CALL CSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF ) + VN1( KP ) = VN1( K ) + VN2( KP ) = VN2( K ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( K ) + JPIV( K ) = ITEMP + END IF +* +* Apply previous Householder reflectors to column K: +* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H. +* + IF( K.GT.1 ) THEN + DO J = 1, K - 1 + F( K, J ) = CONJG( F( K, J ) ) + END DO + CALL CGEMV( 'No transpose', M-I+1, K-1, -CONE, A( I, 1 ), + $ LDA, F( K, 1 ), LDF, CONE, A( I, K ), 1 ) + DO J = 1, K - 1 + F( K, J ) = CONJG( F( K, J ) ) + END DO + END IF +* +* Generate elementary reflector H(k) using the column A(I:M,K). +* + IF( I.LT.M ) THEN + CALL CLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) ) + ELSE + TAU( K ) = CZERO + END IF +* +* Check if TAU(K) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(K) for Inf, +* since CLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by CLARFG can contain Inf, which requires +* TAU(K) to contain NaN. Therefore, this case of generating Inf +* by CLARFG is covered by checking TAU(K) for NaN. +* + IF( SISNAN( REAL( TAU(K) ) ) ) THEN + TAUNAN = REAL( TAU(K) ) + ELSE IF( SISNAN( IMAG( TAU(K) ) ) ) THEN + TAUNAN = IMAG( TAU(K) ) + ELSE + TAUNAN = ZERO + END IF +* + IF( SISNAN( TAUNAN ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = K +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAUNAN + RELMAXC2NRMK = TAUNAN +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* =============================================================== +* + AIK = A( I, K ) + A( I, K ) = CONE +* +* =============================================================== +* +* Compute the current K-th column of F: +* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K). +* + IF( K.LT.N+NRHS ) THEN + CALL CGEMV( 'Conjugate transpose', M-I+1, N+NRHS-K, + $ TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1, + $ CZERO, F( K+1, K ), 1 ) + END IF +* +* 2) Zero out elements above and on the diagonal of the +* column K in matrix F, i.e elements F(1:K,K). +* + DO J = 1, K + F( J, K ) = CZERO + END DO +* +* 3) Incremental updating of the K-th column of F: +* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H +* * A(I:M,K). +* + IF( K.GT.1 ) THEN + CALL CGEMV( 'Conjugate Transpose', M-I+1, K-1, -TAU( K ), + $ A( I, 1 ), LDA, A( I, K ), 1, CZERO, + $ AUXV( 1 ), 1 ) +* + CALL CGEMV( 'No transpose', N+NRHS, K-1, CONE, + $ F( 1, 1 ), LDF, AUXV( 1 ), 1, CONE, + $ F( 1, K ), 1 ) + END IF +* +* =============================================================== +* +* Update the current I-th row of A: +* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) +* - A(I,1:K)*F(K+1:N+NRHS,1:K)**H. +* + IF( K.LT.N+NRHS ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ 1, N+NRHS-K, K, -CONE, A( I, 1 ), LDA, + $ F( K+1, 1 ), LDF, CONE, A( I, K+1 ), LDA ) + END IF +* + A( I, K ) = AIK +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. +* when K < MINMNFACT = min( M-IOFFSET, N ). +* + IF( K.LT.MINMNFACT ) THEN +* + DO J = K + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ABS( A( I, J ) ) / VN1( J ) + TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2.LE.TOL3Z ) THEN +* +* At J-index, we have a difficult column for the +* update of the 2-norm. Save the index of the previous +* difficult column in IWORK(J-1). +* NOTE: ILSTCC > 1, threfore we can use IWORK only +* with N-1 elements, where the elements are +* shifted by 1 to the left. +* + IWORK( J-1 ) = LSTICC +* +* Set the index of the last difficult column LSTICC. +* + LSTICC = J +* + ELSE + VN1( J ) = VN1( J )*SQRT( TEMP ) + END IF + END IF + END DO +* + END IF +* +* End of while loop. +* + END DO +* +* Now, afler the loop: +* Set KB, the number of factorized columns in the block; +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig, IF = IOFFSET + KB. +* + KB = K + IF = I +* +* Apply the block reflector to the residual of the matrix A +* and the residual of the right hand sides B, if the residual +* matrix and and/or the residual of the right hand sides +* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. +* This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, N+NRHS-KB, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* Recompute the 2-norm of the difficult columns. +* Loop over the index of the difficult columns from the largest +* to the smallest index. +* + DO WHILE( LSTICC.GT.0 ) +* +* LSTICC is the index of the last difficult column is greater +* than 1. +* ITEMP is the index of the previous difficult column. +* + ITEMP = IWORK( LSTICC-1 ) +* +* Compute the 2-norm explicilty for the last difficult column and +* save it in the partial and exact 2-norm vectors VN1 and VN2. +* +* NOTE: The computation of VN1( LSTICC ) relies on the fact that +* SCNRM2 does not fail on vectors with norm below the value of +* SQRT(SLAMCH('S')) +* + VN1( LSTICC ) = SCNRM2( M-IF, A( IF+1, LSTICC ), 1 ) + VN2( LSTICC ) = VN1( LSTICC ) +* +* Downdate the index of the last difficult column to +* the index of the previous difficult column. +* + LSTICC = ITEMP +* + END DO +* + RETURN +* +* End of CLAQP3RK +* + END diff --git a/lapack-netlib/SRC/claqz0.f b/lapack-netlib/SRC/claqz0.f index 6de40e06c..c6cc5847d 100644 --- a/lapack-netlib/SRC/claqz0.f +++ b/lapack-netlib/SRC/claqz0.f @@ -89,7 +89,7 @@ *> Anal., 29(2006), pp. 199--227. *> *> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift, -*> multipole rational QZ method with agressive early deflation" +*> multipole rational QZ method with aggressive early deflation" *> \endverbatim * * Arguments: @@ -310,7 +310,7 @@ CHARACTER :: JBCMPZ*3 * External Functions - EXTERNAL :: XERBLA, CHGEQZ, CLAQZ2, CLAQZ3, CLASET, SLABAD, + EXTERNAL :: XERBLA, CHGEQZ, CLAQZ2, CLAQZ3, CLASET, $ CLARTG, CROT REAL, EXTERNAL :: SLAMCH, CLANHS LOGICAL, EXTERNAL :: LSAME @@ -462,7 +462,6 @@ * Get machine constants SAFMIN = SLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE/SAFMIN - CALL SLABAD( SAFMIN, SAFMAX ) ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N )/ULP ) @@ -533,7 +532,7 @@ DO WHILE ( K.GE.ISTART2 ) IF( ABS( B( K, K ) ) .LT. BTOL ) THEN -* A diagonal element of B is negligable, move it +* A diagonal element of B is negligible, move it * to the top and deflate it DO K2 = K, ISTART2+1, -1 diff --git a/lapack-netlib/SRC/clarfb_gett.f b/lapack-netlib/SRC/clarfb_gett.f index ee6959ed8..5f042e345 100644 --- a/lapack-netlib/SRC/clarfb_gett.f +++ b/lapack-netlib/SRC/clarfb_gett.f @@ -452,7 +452,7 @@ IF( LNOTIDENT ) THEN * * col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2, -* V1 is not an identy matrix, but unit lower-triangular +* V1 is not an identity matrix, but unit lower-triangular * V1 stored in A1 (diagonal ones are not stored). * * diff --git a/lapack-netlib/SRC/clarfgp.f b/lapack-netlib/SRC/clarfgp.f index b584484c7..980e93612 100644 --- a/lapack-netlib/SRC/clarfgp.f +++ b/lapack-netlib/SRC/clarfgp.f @@ -97,7 +97,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERauxiliary +*> \ingroup larfgp * * ===================================================================== SUBROUTINE CLARFGP( N, ALPHA, X, INCX, TAU ) @@ -122,7 +122,7 @@ * .. * .. Local Scalars .. INTEGER J, KNT - REAL ALPHI, ALPHR, BETA, BIGNUM, SMLNUM, XNORM + REAL ALPHI, ALPHR, BETA, BIGNUM, EPS, SMLNUM, XNORM COMPLEX SAVEALPHA * .. * .. External Functions .. @@ -143,37 +143,28 @@ RETURN END IF * + EPS = SLAMCH( 'Precision' ) XNORM = SCNRM2( N-1, X, INCX ) ALPHR = REAL( ALPHA ) ALPHI = AIMAG( ALPHA ) * - IF( XNORM.EQ.ZERO ) THEN + IF( XNORM.LE.EPS*ABS(ALPHA) .AND. ALPHI.EQ.ZERO ) THEN * * H = [1-alpha/abs(alpha) 0; 0 I], sign chosen so ALPHA >= 0. * - IF( ALPHI.EQ.ZERO ) THEN - IF( ALPHR.GE.ZERO ) THEN -* When TAU.eq.ZERO, the vector is special-cased to be -* all zeros in the application routines. We do not need -* to clear it. - TAU = ZERO - ELSE -* However, the application routines rely on explicit -* zero checks when TAU.ne.ZERO, and we must clear X. - TAU = TWO - DO J = 1, N-1 - X( 1 + (J-1)*INCX ) = ZERO - END DO - ALPHA = -ALPHA - END IF + IF( ALPHR.GE.ZERO ) THEN +* When TAU.eq.ZERO, the vector is special-cased to be +* all zeros in the application routines. We do not need +* to clear it. + TAU = ZERO ELSE -* Only "reflecting" the diagonal entry to be real and non-negative. - XNORM = SLAPY2( ALPHR, ALPHI ) - TAU = CMPLX( ONE - ALPHR / XNORM, -ALPHI / XNORM ) +* However, the application routines rely on explicit +* zero checks when TAU.ne.ZERO, and we must clear X. + TAU = TWO DO J = 1, N-1 X( 1 + (J-1)*INCX ) = ZERO END DO - ALPHA = XNORM + ALPHA = -ALPHA END IF ELSE * diff --git a/lapack-netlib/SRC/classq.f90 b/lapack-netlib/SRC/classq.f90 index cb4e7971f..c5f793cc0 100644 --- a/lapack-netlib/SRC/classq.f90 +++ b/lapack-netlib/SRC/classq.f90 @@ -34,28 +34,15 @@ !> !> \verbatim !> -!> CLASSQ returns the values scl and smsq such that +!> CLASSQ returns the values scale_out and sumsq_out such that !> -!> ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq, +!> (scale_out**2)*sumsq_out = x( 1 )**2 +...+ x( n )**2 + (scale**2)*sumsq, !> -!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is +!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is !> assumed to be non-negative. !> !> scale and sumsq must be supplied in SCALE and SUMSQ and -!> scl and smsq are overwritten on SCALE and SUMSQ respectively. -!> -!> If scale * sqrt( sumsq ) > tbig then -!> we require: scale >= sqrt( TINY*EPS ) / sbig on entry, -!> and if 0 < scale * sqrt( sumsq ) < tsml then -!> we require: scale <= sqrt( HUGE ) / ssml on entry, -!> where -!> tbig -- upper threshold for values whose square is representable; -!> sbig -- scaling constant for big numbers; \see la_constants.f90 -!> tsml -- lower threshold for values whose square is representable; -!> ssml -- scaling constant for small numbers; \see la_constants.f90 -!> and -!> TINY*EPS -- tiniest representable number; -!> HUGE -- biggest representable number. +!> scale_out and sumsq_out are overwritten on SCALE and SUMSQ respectively. !> !> \endverbatim ! @@ -72,7 +59,7 @@ !> \verbatim !> X is COMPLEX array, dimension (1+(N-1)*abs(INCX)) !> The vector for which a scaled sum of squares is computed. -!> x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n. +!> x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n. !> \endverbatim !> !> \param[in] INCX @@ -82,24 +69,24 @@ !> If INCX > 0, X(1+(i-1)*INCX) = x(i) for 1 <= i <= n !> If INCX < 0, X(1-(n-i)*INCX) = x(i) for 1 <= i <= n !> If INCX = 0, x isn't a vector so there is no need to call -!> this subroutine. If you call it anyway, it will count x(1) +!> this subroutine. If you call it anyway, it will count x(1) !> in the vector norm N times. !> \endverbatim !> !> \param[in,out] SCALE !> \verbatim !> SCALE is REAL -!> On entry, the value scale in the equation above. -!> On exit, SCALE is overwritten with scl , the scaling factor +!> On entry, the value scale in the equation above. +!> On exit, SCALE is overwritten by scale_out, the scaling factor !> for the sum of squares. !> \endverbatim !> !> \param[in,out] SUMSQ !> \verbatim !> SUMSQ is REAL -!> On entry, the value sumsq in the equation above. -!> On exit, SUMSQ is overwritten with smsq , the basic sum of -!> squares from which scl has been factored out. +!> On entry, the value sumsq in the equation above. +!> On exit, SUMSQ is overwritten by sumsq_out, the basic sum of +!> squares from which scale_out has been factored out. !> \endverbatim ! ! Authors: @@ -130,10 +117,10 @@ !> !> \endverbatim ! -!> \ingroup OTHERauxiliary +!> \ingroup lassq ! ! ===================================================================== -subroutine CLASSQ( n, x, incx, scl, sumsq ) +subroutine CLASSQ( n, x, incx, scale, sumsq ) use LA_CONSTANTS, & only: wp=>sp, zero=>szero, one=>sone, & sbig=>ssbig, ssml=>sssml, tbig=>stbig, tsml=>stsml @@ -145,7 +132,7 @@ subroutine CLASSQ( n, x, incx, scl, sumsq ) ! ! .. Scalar Arguments .. integer :: incx, n - real(wp) :: scl, sumsq + real(wp) :: scale, sumsq ! .. ! .. Array Arguments .. complex(wp) :: x(*) @@ -158,10 +145,10 @@ subroutine CLASSQ( n, x, incx, scl, sumsq ) ! ! Quick return if possible ! - if( LA_ISNAN(scl) .or. LA_ISNAN(sumsq) ) return - if( sumsq == zero ) scl = one - if( scl == zero ) then - scl = one + if( LA_ISNAN(scale) .or. LA_ISNAN(sumsq) ) return + if( sumsq == zero ) scale = one + if( scale == zero ) then + scale = one sumsq = zero end if if (n <= 0) then @@ -207,15 +194,27 @@ subroutine CLASSQ( n, x, incx, scl, sumsq ) ! Put the existing sum of squares into one of the accumulators ! if( sumsq > zero ) then - ax = scl*sqrt( sumsq ) + ax = scale*sqrt( sumsq ) if (ax > tbig) then -! We assume scl >= sqrt( TINY*EPS ) / sbig - abig = abig + (scl*sbig)**2 * sumsq + if (scale > one) then + scale = scale * sbig + abig = abig + scale * (scale * sumsq) + else + ! sumsq > tbig^2 => (sbig * (sbig * sumsq)) is representable + abig = abig + scale * (scale * (sbig * (sbig * sumsq))) + end if else if (ax < tsml) then -! We assume scl <= sqrt( HUGE ) / ssml - if (notbig) asml = asml + (scl*ssml)**2 * sumsq + if (notbig) then + if (scale < one) then + scale = scale * ssml + asml = asml + scale * (scale * sumsq) + else + ! sumsq < tsml^2 => (ssml * (ssml * sumsq)) is representable + asml = asml + scale * (scale * (ssml * (ssml * sumsq))) + end if + end if else - amed = amed + scl**2 * sumsq + amed = amed + scale * (scale * sumsq) end if end if ! @@ -229,7 +228,7 @@ subroutine CLASSQ( n, x, incx, scl, sumsq ) if (amed > zero .or. LA_ISNAN(amed)) then abig = abig + (amed*sbig)*sbig end if - scl = one / sbig + scale = one / sbig sumsq = abig else if (asml > zero) then ! @@ -245,17 +244,17 @@ subroutine CLASSQ( n, x, incx, scl, sumsq ) ymin = asml ymax = amed end if - scl = one + scale = one sumsq = ymax**2*( one + (ymin/ymax)**2 ) else - scl = one / ssml + scale = one / ssml sumsq = asml end if else ! ! Otherwise all values are mid-range or zero ! - scl = one + scale = one sumsq = amed end if return diff --git a/lapack-netlib/SRC/claswlq.f b/lapack-netlib/SRC/claswlq.f index 63cbd02c6..2044e055c 100644 --- a/lapack-netlib/SRC/claswlq.f +++ b/lapack-netlib/SRC/claswlq.f @@ -96,21 +96,24 @@ *> The leading dimension of the array T. LDT >= MB. *> \endverbatim *> -*> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim -*> The dimension of the array WORK. LWORK >= MB*M. +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -158,40 +161,42 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup laswlq +*> * ===================================================================== SUBROUTINE CLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), T( LDT, *) + COMPLEX A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME - EXTERNAL LSAME + INTEGER ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL CGELQT, CTPLQT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. -* .. EXTERNAL FUNCTIONS .. - INTEGER ILAENV - EXTERNAL ILAENV -* .. * .. EXECUTABLE STATEMENTS .. * * TEST THE INPUT ARGUMENTS @@ -199,12 +204,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LE.0 ) THEN INFO = -4 @@ -212,60 +224,61 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = MB*M +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN + IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN CALL CGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL CGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL CGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M , (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL CTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + CALL CTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), $ LDA, T(1,CTR*M+1), $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL CTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), $ LDA, T(1,CTR*M+1), LDT, $ WORK, INFO ) - END IF + END IF * - WORK( 1 ) = M * MB + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLASWLQ diff --git a/lapack-netlib/SRC/clatdf.f b/lapack-netlib/SRC/clatdf.f index 5445e387e..1e7d71669 100644 --- a/lapack-netlib/SRC/clatdf.f +++ b/lapack-netlib/SRC/clatdf.f @@ -227,7 +227,7 @@ BM = RHS( J ) - CONE SPLUS = ONE * -* Lockahead for L- part RHS(1:N-1) = +-1 +* Look-ahead for L- part RHS(1:N-1) = +-1 * SPLUS and SMIN computed more efficiently than in BSOLVE[1]. * SPLUS = SPLUS + REAL( CDOTC( N-J, Z( J+1, J ), 1, Z( J+1, diff --git a/lapack-netlib/SRC/clatrs3.f b/lapack-netlib/SRC/clatrs3.f index a902f1ed0..354141a8b 100644 --- a/lapack-netlib/SRC/clatrs3.f +++ b/lapack-netlib/SRC/clatrs3.f @@ -152,13 +152,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK). +*> WORK is REAL array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal size of *> WORK. *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -166,6 +170,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -182,7 +187,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -257,15 +262,16 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, CLANGE, SLARMM - EXTERNAL ILAENV, LSAME, SLAMCH, CLANGE, SLARMM + REAL SLAMCH, CLANGE, SLARMM, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SLAMCH, CLANGE, SLARMM, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLATRS, CSSCAL, XERBLA @@ -296,15 +302,24 @@ * row. WORK( I + KK * LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). +* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * * Test the input parameters. * @@ -326,7 +341,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN @@ -577,7 +592,7 @@ * Prepare the linear update to be executed with GEMM. * For each column, compute a consistent scaling, a * scaling factor to survive the linear update, and -* rescale the column segments, if necesssary. Then +* rescale the column segments, if necessary. Then * the linear update is safely executed. * DO KK = 1, K2-K1 @@ -659,6 +674,9 @@ END IF END DO END DO +* + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) +* RETURN * * End of CLATRS3 diff --git a/lapack-netlib/SRC/clatsqr.f b/lapack-netlib/SRC/clatsqr.f index 7a0b85487..67403693f 100644 --- a/lapack-netlib/SRC/clatsqr.f +++ b/lapack-netlib/SRC/clatsqr.f @@ -101,14 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim -*> The dimension of the array WORK. LWORK >= NB*N. +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -160,33 +164,38 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup latsqr +*> * ===================================================================== SUBROUTINE CLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), T(LDT, *) + COMPLEX A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, LWMIN, MINMN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. - EXTERNAL CGEQRT, CTPQRT, XERBLA + EXTERNAL CGEQRT, CTPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -197,6 +206,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -204,64 +220,65 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = NB*N +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL CGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + IF ( (MB.LE.N) .OR. (MB.GE.M) ) THEN + CALL CGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL CGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) - CTR = 1 + CALL CGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = MB+1, II-MB+N , (MB-N) + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL CTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + CALL CTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, $ T(1,CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL CTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + IF( II.LE.M ) THEN + CALL CTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, $ T(1, CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + $ WORK, INFO ) + END IF * - work( 1 ) = N*NB + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLATSQR diff --git a/lapack-netlib/SRC/cpbsv.f b/lapack-netlib/SRC/cpbsv.f index 248abbc1f..889bbde08 100644 --- a/lapack-netlib/SRC/cpbsv.f +++ b/lapack-netlib/SRC/cpbsv.f @@ -119,9 +119,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/cpbsvx.f b/lapack-netlib/SRC/cpbsvx.f index 652e18501..975c87768 100644 --- a/lapack-netlib/SRC/cpbsvx.f +++ b/lapack-netlib/SRC/cpbsvx.f @@ -70,7 +70,7 @@ *> where U is an upper triangular band matrix, and L is a lower *> triangular band matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -280,10 +280,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/cpbtf2.f b/lapack-netlib/SRC/cpbtf2.f index 0be2c0a7f..f5bc9b3a6 100644 --- a/lapack-netlib/SRC/cpbtf2.f +++ b/lapack-netlib/SRC/cpbtf2.f @@ -97,8 +97,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/cpbtrf.f b/lapack-netlib/SRC/cpbtrf.f index a4c18efb3..af60780c8 100644 --- a/lapack-netlib/SRC/cpbtrf.f +++ b/lapack-netlib/SRC/cpbtrf.f @@ -92,8 +92,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/cpftrf.f b/lapack-netlib/SRC/cpftrf.f index cbaab6832..12799c6f7 100644 --- a/lapack-netlib/SRC/cpftrf.f +++ b/lapack-netlib/SRC/cpftrf.f @@ -91,8 +91,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> *> Further Notes on RFP Format: diff --git a/lapack-netlib/SRC/cposv.f b/lapack-netlib/SRC/cposv.f index f37dfa3c0..ea6fc37db 100644 --- a/lapack-netlib/SRC/cposv.f +++ b/lapack-netlib/SRC/cposv.f @@ -110,9 +110,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/cposvx.f b/lapack-netlib/SRC/cposvx.f index 78b9f4db1..322a26447 100644 --- a/lapack-netlib/SRC/cposvx.f +++ b/lapack-netlib/SRC/cposvx.f @@ -70,7 +70,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -276,10 +276,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/cposvxx.f b/lapack-netlib/SRC/cposvxx.f index 7834c75da..c40a2d856 100644 --- a/lapack-netlib/SRC/cposvxx.f +++ b/lapack-netlib/SRC/cposvxx.f @@ -87,7 +87,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A (see argument RCOND). If the reciprocal of the condition number diff --git a/lapack-netlib/SRC/cpotf2.f b/lapack-netlib/SRC/cpotf2.f index 2f4658bae..d84988949 100644 --- a/lapack-netlib/SRC/cpotf2.f +++ b/lapack-netlib/SRC/cpotf2.f @@ -89,8 +89,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/cpotrf.f b/lapack-netlib/SRC/cpotrf.f index 6aba3103e..e2b120a49 100644 --- a/lapack-netlib/SRC/cpotrf.f +++ b/lapack-netlib/SRC/cpotrf.f @@ -87,8 +87,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/cpotrf2.f b/lapack-netlib/SRC/cpotrf2.f index e1eae3e9d..ea2e4ca98 100644 --- a/lapack-netlib/SRC/cpotrf2.f +++ b/lapack-netlib/SRC/cpotrf2.f @@ -86,8 +86,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/cppsv.f b/lapack-netlib/SRC/cppsv.f index 1e6f02695..a8fd660c4 100644 --- a/lapack-netlib/SRC/cppsv.f +++ b/lapack-netlib/SRC/cppsv.f @@ -104,9 +104,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/cppsvx.f b/lapack-netlib/SRC/cppsvx.f index f6f07538c..2ef02100f 100644 --- a/lapack-netlib/SRC/cppsvx.f +++ b/lapack-netlib/SRC/cppsvx.f @@ -69,7 +69,7 @@ *> where U is an upper triangular matrix, L is a lower triangular *> matrix, and **H indicates conjugate transpose. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -262,10 +262,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/cpptrf.f b/lapack-netlib/SRC/cpptrf.f index 4e81458cb..e36f834cb 100644 --- a/lapack-netlib/SRC/cpptrf.f +++ b/lapack-netlib/SRC/cpptrf.f @@ -79,9 +79,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be -*> completed. +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive definite, and the factorization could +*> not be completed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/cpteqr.f b/lapack-netlib/SRC/cpteqr.f index e3af59041..fc9c44908 100644 --- a/lapack-netlib/SRC/cpteqr.f +++ b/lapack-netlib/SRC/cpteqr.f @@ -123,8 +123,8 @@ *> < 0: if INFO = -i, the i-th argument had an illegal value. *> > 0: if INFO = i, and i is: *> <= N the Cholesky factorization of the matrix could -*> not be performed because the i-th principal minor -*> was not positive definite. +*> not be performed because the leading principal +*> minor of order i was not positive. *> > N the SVD algorithm failed to converge; *> if INFO = N+i, i off-diagonal elements of the *> bidiagonal factor did not converge to zero. diff --git a/lapack-netlib/SRC/cptsv.f b/lapack-netlib/SRC/cptsv.f index 20ee32bbe..4c16f6a0a 100644 --- a/lapack-netlib/SRC/cptsv.f +++ b/lapack-netlib/SRC/cptsv.f @@ -94,8 +94,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the solution has not been +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the solution has not been *> computed. The factorization has not been completed *> unless i = N. *> \endverbatim diff --git a/lapack-netlib/SRC/cptsvx.f b/lapack-netlib/SRC/cptsvx.f index db63a3c36..6f7d8cf5b 100644 --- a/lapack-netlib/SRC/cptsvx.f +++ b/lapack-netlib/SRC/cptsvx.f @@ -60,7 +60,7 @@ *> factorization can also be regarded as having the form *> A = U**H*D*U. *> -*> 2. If the leading i-by-i principal minor is not positive definite, +*> 2. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -205,10 +205,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/cpttrf.f b/lapack-netlib/SRC/cpttrf.f index c6379e923..111343b78 100644 --- a/lapack-netlib/SRC/cpttrf.f +++ b/lapack-netlib/SRC/cpttrf.f @@ -71,8 +71,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite; if k < N, the factorization could not +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive; if k < N, the factorization could not *> be completed, while if k = N, the factorization was *> completed, but D(N) <= 0. *> \endverbatim diff --git a/lapack-netlib/SRC/crscl.c b/lapack-netlib/SRC/crscl.c new file mode 100644 index 000000000..7c87553d5 --- /dev/null +++ b/lapack-netlib/SRC/crscl.c @@ -0,0 +1,735 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b CRSCL multiplies a vector by the reciprocal of a real scalar. */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download CRSCL + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE CRSCL( N, A, X, INCX ) */ + +/* INTEGER INCX, N */ +/* COMPLEX A */ +/* COMPLEX X( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > CRSCL multiplies an n-element complex vector x by the complex scalar */ +/* > 1/a. This is done without overflow or underflow as long as */ +/* > the final result x/a does not overflow or underflow. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of components of the vector x. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX */ +/* > The scalar a which is used to divide each component of x. */ +/* > A must not be 0, or the subroutine will divide by zero. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is COMPLEX array, dimension */ +/* > (1+(N-1)*abs(INCX)) */ +/* > The n-element vector x. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] INCX */ +/* > \verbatim */ +/* > INCX is INTEGER */ +/* > The increment between successive values of the vector X. */ +/* > > 0: X(1) = X(1) and X(1+(i-1)*INCX) = x(i), 1< i<= n */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup complexOTHERauxiliary */ + +/* ===================================================================== */ +/* Subroutine */ int crscl_(integer *n, complex *a, complex *x, integer *incx) +{ + /* System generated locals */ + real r__1, r__2; + complex q__1; + + /* Local variables */ + real absi, absr; + extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + integer *); + real ai, ar, ui, ov, ur; + extern real slamch_(char *); + extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + *); + real safmin, safmax; + extern /* Subroutine */ int csrscl_(integer *, real *, complex *, integer + *); + + +/* -- LAPACK auxiliary routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Quick return if possible */ + + /* Parameter adjustments */ + --x; + + /* Function Body */ + if (*n <= 0) { + return 0; + } + +/* Get machine parameters */ + + safmin = slamch_("S"); + safmax = 1.f / safmin; + ov = slamch_("O"); + +/* Initialize constants related to A. */ + + ar = a->r; + ai = r_imag(a); + absr = abs(ar); + absi = abs(ai); + + if (ai == 0.f) { +/* If alpha is real, then we can use csrscl */ + csrscl_(n, &ar, &x[1], incx); + + } else if (ar == 0.f) { +/* If alpha has a zero real part, then we follow the same rules as if */ +/* alpha were real. */ + if (absi > safmax) { + csscal_(n, &safmin, &x[1], incx); + r__1 = -safmax / ai; + q__1.r = 0.f, q__1.i = r__1; + cscal_(n, &q__1, &x[1], incx); + } else if (absi < safmin) { + r__1 = -safmin / ai; + q__1.r = 0.f, q__1.i = r__1; + cscal_(n, &q__1, &x[1], incx); + csscal_(n, &safmax, &x[1], incx); + } else { + r__1 = -1.f / ai; + q__1.r = 0.f, q__1.i = r__1; + cscal_(n, &q__1, &x[1], incx); + } + + } else { +/* The following numbers can be computed. */ +/* They are the inverse of the real and imaginary parts of 1/alpha. */ +/* Note that a and b are always different from zero. */ +/* NaNs are only possible if either: */ +/* 1. alphaR or alphaI is NaN. */ +/* 2. alphaR and alphaI are both infinite, in which case it makes sense */ +/* to propagate a NaN. */ + ur = ar + ai * (ai / ar); + ui = ai + ar * (ar / ai); + + if (abs(ur) < safmin || abs(ui) < safmin) { +/* This means that both alphaR and alphaI are very small. */ + r__1 = safmin / ur; + r__2 = -safmin / ui; + q__1.r = r__1, q__1.i = r__2; + cscal_(n, &q__1, &x[1], incx); + csscal_(n, &safmax, &x[1], incx); + } else if (abs(ur) > safmax || abs(ui) > safmax) { + if (absr > ov || absi > ov) { +/* This means that a and b are both Inf. No need for scaling. */ + r__1 = 1.f / ur; + r__2 = -1.f / ui; + q__1.r = r__1, q__1.i = r__2; + cscal_(n, &q__1, &x[1], incx); + } else { + csscal_(n, &safmin, &x[1], incx); + if (abs(ur) > ov || abs(ui) > ov) { +/* Infs were generated. We do proper scaling to avoid them. */ + if (absr >= absi) { +/* ABS( UR ) <= ABS( UI ) */ + ur = safmin * ar + safmin * (ai * (ai / ar)); + ui = safmin * ai + ar * (safmin * ar / ai); + } else { +/* ABS( UR ) > ABS( UI ) */ + ur = safmin * ar + ai * (safmin * ai / ar); + ui = safmin * ai + safmin * (ar * (ar / ai)); + } + r__1 = 1.f / ur; + r__2 = -1.f / ui; + q__1.r = r__1, q__1.i = r__2; + cscal_(n, &q__1, &x[1], incx); + } else { + r__1 = safmax / ur; + r__2 = -safmax / ui; + q__1.r = r__1, q__1.i = r__2; + cscal_(n, &q__1, &x[1], incx); + } + } + } else { + r__1 = 1.f / ur; + r__2 = -1.f / ui; + q__1.r = r__1, q__1.i = r__2; + cscal_(n, &q__1, &x[1], incx); + } + } + + return 0; + +/* End of CRSCL */ + +} /* crscl_ */ + diff --git a/lapack-netlib/SRC/crscl.f b/lapack-netlib/SRC/crscl.f new file mode 100644 index 000000000..22919cd62 --- /dev/null +++ b/lapack-netlib/SRC/crscl.f @@ -0,0 +1,202 @@ +*> \brief \b CRSCL multiplies a vector by the reciprocal of a real scalar. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CRSCL + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CRSCL( N, A, X, INCX ) +* +* .. Scalar Arguments .. +* INTEGER INCX, N +* COMPLEX A +* .. +* .. Array Arguments .. +* COMPLEX X( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CRSCL multiplies an n-element complex vector x by the complex scalar +*> 1/a. This is done without overflow or underflow as long as +*> the final result x/a does not overflow or underflow. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of components of the vector x. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX +*> The scalar a which is used to divide each component of x. +*> A must not be 0, or the subroutine will divide by zero. +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is COMPLEX array, dimension +*> (1+(N-1)*abs(INCX)) +*> The n-element vector x. +*> \endverbatim +*> +*> \param[in] INCX +*> \verbatim +*> INCX is INTEGER +*> The increment between successive values of the vector X. +*> > 0: X(1) = X(1) and X(1+(i-1)*INCX) = x(i), 1< i<= n +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexOTHERauxiliary +* +* ===================================================================== + SUBROUTINE CRSCL( N, A, X, INCX ) +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INCX, N + COMPLEX A +* .. +* .. Array Arguments .. + COMPLEX X( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + REAL SAFMAX, SAFMIN, OV, AR, AI, ABSR, ABSI, UR + % , UI +* .. +* .. External Functions .. + REAL SLAMCH + COMPLEX CLADIV + EXTERNAL SLAMCH, CLADIV +* .. +* .. External Subroutines .. + EXTERNAL CSCAL, CSSCAL, CSRSCL +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. +* .. Executable Statements .. +* +* Quick return if possible +* + IF( N.LE.0 ) + $ RETURN +* +* Get machine parameters +* + SAFMIN = SLAMCH( 'S' ) + SAFMAX = ONE / SAFMIN + OV = SLAMCH( 'O' ) +* +* Initialize constants related to A. +* + AR = REAL( A ) + AI = AIMAG( A ) + ABSR = ABS( AR ) + ABSI = ABS( AI ) +* + IF( AI.EQ.ZERO ) THEN +* If alpha is real, then we can use csrscl + CALL CSRSCL( N, AR, X, INCX ) +* + ELSE IF( AR.EQ.ZERO ) THEN +* If alpha has a zero real part, then we follow the same rules as if +* alpha were real. + IF( ABSI.GT.SAFMAX ) THEN + CALL CSSCAL( N, SAFMIN, X, INCX ) + CALL CSCAL( N, CMPLX( ZERO, -SAFMAX / AI ), X, INCX ) + ELSE IF( ABSI.LT.SAFMIN ) THEN + CALL CSCAL( N, CMPLX( ZERO, -SAFMIN / AI ), X, INCX ) + CALL CSSCAL( N, SAFMAX, X, INCX ) + ELSE + CALL CSCAL( N, CMPLX( ZERO, -ONE / AI ), X, INCX ) + END IF +* + ELSE +* The following numbers can be computed. +* They are the inverse of the real and imaginary parts of 1/alpha. +* Note that a and b are always different from zero. +* NaNs are only possible if either: +* 1. alphaR or alphaI is NaN. +* 2. alphaR and alphaI are both infinite, in which case it makes sense +* to propagate a NaN. + UR = AR + AI * ( AI / AR ) + UI = AI + AR * ( AR / AI ) +* + IF( (ABS( UR ).LT.SAFMIN).OR.(ABS( UI ).LT.SAFMIN) ) THEN +* This means that both alphaR and alphaI are very small. + CALL CSCAL( N, CMPLX( SAFMIN / UR, -SAFMIN / UI ), X, INCX ) + CALL CSSCAL( N, SAFMAX, X, INCX ) + ELSE IF( (ABS( UR ).GT.SAFMAX).OR.(ABS( UI ).GT.SAFMAX) ) THEN + IF( (ABSR.GT.OV).OR.(ABSI.GT.OV) ) THEN +* This means that a and b are both Inf. No need for scaling. + CALL CSCAL( N, CMPLX( ONE / UR, -ONE / UI ), X, INCX ) + ELSE + CALL CSSCAL( N, SAFMIN, X, INCX ) + IF( (ABS( UR ).GT.OV).OR.(ABS( UI ).GT.OV) ) THEN +* Infs were generated. We do proper scaling to avoid them. + IF( ABSR.GE.ABSI ) THEN +* ABS( UR ) <= ABS( UI ) + UR = (SAFMIN * AR) + SAFMIN * (AI * ( AI / AR )) + UI = (SAFMIN * AI) + AR * ( (SAFMIN * AR) / AI ) + ELSE +* ABS( UR ) > ABS( UI ) + UR = (SAFMIN * AR) + AI * ( (SAFMIN * AI) / AR ) + UI = (SAFMIN * AI) + SAFMIN * (AR * ( AR / AI )) + END IF + CALL CSCAL( N, CMPLX( ONE / UR, -ONE / UI ), X, INCX ) + ELSE + CALL CSCAL( N, CMPLX( SAFMAX / UR, -SAFMAX / UI ), + $ X, INCX ) + END IF + END IF + ELSE + CALL CSCAL( N, CMPLX( ONE / UR, -ONE / UI ), X, INCX ) + END IF + END IF +* + RETURN +* +* End of CRSCL +* + END diff --git a/lapack-netlib/SRC/cstedc.f b/lapack-netlib/SRC/cstedc.f index a57d9eaef..d7db591b3 100644 --- a/lapack-netlib/SRC/cstedc.f +++ b/lapack-netlib/SRC/cstedc.f @@ -43,12 +43,6 @@ *> be found if CHETRD or CHPTRD or CHBTRD has been used to reduce this *> matrix to tridiagonal form. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. See SLAED3 for details. *> \endverbatim * * Arguments: @@ -198,7 +192,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup stedc * *> \par Contributors: * ================== @@ -239,8 +233,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANST - EXTERNAL ILAENV, LSAME, SLAMCH, SLANST + REAL SLAMCH, SLANST, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SLAMCH, SLANST, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CLACPY, CLACRM, CLAED0, CSTEQR, CSWAP, @@ -301,7 +295,7 @@ LRWMIN = 1 + 4*N + 2*N**2 LIWMIN = 3 + 5*N END IF - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * @@ -472,7 +466,7 @@ END IF * 70 CONTINUE - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * diff --git a/lapack-netlib/SRC/cstegr.f b/lapack-netlib/SRC/cstegr.f index 9d6e06da3..a162d5076 100644 --- a/lapack-netlib/SRC/cstegr.f +++ b/lapack-netlib/SRC/cstegr.f @@ -56,7 +56,7 @@ *> *> Note : CSTEGR and CSTEMR work only on machines which follow *> IEEE-754 floating-point standard in their handling of infinities and -*> NaNs. Normal execution may create these exceptiona values and hence +*> NaNs. Normal execution may create these exceptional values and hence *> may abort due to a floating point exception in environments which *> do not conform to the IEEE-754 standard. *> \endverbatim diff --git a/lapack-netlib/SRC/cstemr.f b/lapack-netlib/SRC/cstemr.f index d49684db3..46b20d880 100644 --- a/lapack-netlib/SRC/cstemr.f +++ b/lapack-netlib/SRC/cstemr.f @@ -320,7 +320,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup stemr * *> \par Contributors: * ================== @@ -329,7 +329,8 @@ *> Jim Demmel, University of California, Berkeley, USA \n *> Inderjit Dhillon, University of Texas, Austin, USA \n *> Osni Marques, LBNL/NERSC, USA \n -*> Christof Voemel, University of California, Berkeley, USA +*> Christof Voemel, University of California, Berkeley, USA \n +*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n * * ===================================================================== SUBROUTINE CSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, @@ -361,7 +362,8 @@ $ MINRGP = 3.0E-3 ) * .. * .. Local Scalars .. - LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY + LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY, + $ LAESWAP INTEGER I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW, $ IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD, $ INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP, @@ -374,8 +376,8 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, SLANST - EXTERNAL LSAME, SLAMCH, SLANST + REAL SLAMCH, SLANST, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANST, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLARRV, CSWAP, SCOPY, SLAE2, SLAEV2, SLARRC, @@ -397,6 +399,7 @@ * LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) ) ZQUERY = ( NZC.EQ.-1 ) + LAESWAP = .FALSE. * SSTEMR needs WORK of size 6*N, IWORK of size 3*N. * In addition, SLARRE needs WORK of size 6*N, IWORK of size 5*N. @@ -459,7 +462,7 @@ RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * IF( WANTZ .AND. ALLEIG ) THEN @@ -519,6 +522,15 @@ ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN CALL SLAEV2( D(1), E(1), D(2), R1, R2, CS, SN ) END IF +* D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However, +* the following code requires R1 >= R2. Hence, we correct +* the order of R1, R2, CS, SN if R1 < R2 before further processing. + IF( R1.LT.R2 ) THEN + E(2) = R1 + R1 = R2 + R2 = E(2) + LAESWAP = .TRUE. + ENDIF IF( ALLEIG.OR. $ (VALEIG.AND.(R2.GT.WL).AND. $ (R2.LE.WU)).OR. @@ -526,8 +538,13 @@ M = M+1 W( M ) = R2 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = -SN - Z( 2, M ) = CS + IF( LAESWAP ) THEN + Z( 1, M ) = CS + Z( 2, M ) = SN + ELSE + Z( 1, M ) = -SN + Z( 2, M ) = CS + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN @@ -550,8 +567,13 @@ M = M+1 W( M ) = R1 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = CS - Z( 2, M ) = SN + IF( LAESWAP ) THEN + Z( 1, M ) = -SN + Z( 2, M ) = CS + ELSE + Z( 1, M ) = CS + Z( 2, M ) = SN + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN @@ -779,7 +801,7 @@ ENDIF * * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN RETURN * diff --git a/lapack-netlib/SRC/csyconvf.f b/lapack-netlib/SRC/csyconvf.f index f3b46f0b4..febbc411c 100644 --- a/lapack-netlib/SRC/csyconvf.f +++ b/lapack-netlib/SRC/csyconvf.f @@ -39,7 +39,7 @@ *> CSYTRF provided on entry in parameter A into the factorization *> output format used in CSYTRF_RK (or CSYTRF_BK) that is stored *> on exit in parameters A and E. It also converts in place details of -*> the intechanges stored in IPIV from the format used in CSYTRF into +*> the interchanges stored in IPIV from the format used in CSYTRF into *> the format used in CSYTRF_RK (or CSYTRF_BK). *> *> If parameter WAY = 'R': @@ -48,7 +48,7 @@ *> (or CSYTRF_BK) provided on entry in parameters A and E into *> the factorization output format used in CSYTRF that is stored *> on exit in parameter A. It also converts in place details of -*> the intechanges stored in IPIV from the format used in CSYTRF_RK +*> the interchanges stored in IPIV from the format used in CSYTRF_RK *> (or CSYTRF_BK) into the format used in CSYTRF. *> *> CSYCONVF can also convert in Hermitian matrix case, i.e. between @@ -325,7 +325,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -469,7 +469,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -535,7 +535,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/csyconvf_rook.f b/lapack-netlib/SRC/csyconvf_rook.f index a5b9d82da..0da34e0fa 100644 --- a/lapack-netlib/SRC/csyconvf_rook.f +++ b/lapack-netlib/SRC/csyconvf_rook.f @@ -520,7 +520,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/csysv.f b/lapack-netlib/SRC/csysv.f index 4ddabf62f..a2d1e7cbe 100644 --- a/lapack-netlib/SRC/csysv.f +++ b/lapack-netlib/SRC/csysv.f @@ -163,7 +163,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYsolve +*> \ingroup hesv * * ===================================================================== SUBROUTINE CSYSV( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -190,7 +190,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CSYTRF, CSYTRS, CSYTRS2 @@ -225,7 +226,7 @@ CALL CSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) LWKOPT = INT( WORK( 1 ) ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -258,7 +259,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/csysv_aa.f b/lapack-netlib/SRC/csysv_aa.f index 8548c2789..571a91123 100644 --- a/lapack-netlib/SRC/csysv_aa.f +++ b/lapack-netlib/SRC/csysv_aa.f @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYsolve +*> \ingroup hesv_aa * * ===================================================================== SUBROUTINE CSYSV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -182,7 +182,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CSYTRF_AA, CSYTRS_AA @@ -217,7 +218,7 @@ $ -1, INFO ) LWKOPT_SYTRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -239,7 +240,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/csysv_aa_2stage.f b/lapack-netlib/SRC/csysv_aa_2stage.f index d8881a748..10119d8ba 100644 --- a/lapack-netlib/SRC/csysv_aa_2stage.f +++ b/lapack-netlib/SRC/csysv_aa_2stage.f @@ -87,7 +87,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> @@ -177,7 +177,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hesv_aa_2stage * * ===================================================================== SUBROUTINE CSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, @@ -207,7 +207,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CSYTRF_AA_2STAGE, @@ -267,7 +268,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/csysv_rk.f b/lapack-netlib/SRC/csysv_rk.f index ef5334dcd..cb98ab1dc 100644 --- a/lapack-netlib/SRC/csysv_rk.f +++ b/lapack-netlib/SRC/csysv_rk.f @@ -205,7 +205,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYsolve +*> \ingroup hesv_rk * *> \par Contributors: * ================== @@ -247,7 +247,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CSYTRF_RK, CSYTRS_3 @@ -282,7 +283,7 @@ CALL CSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) LWKOPT = INT( WORK( 1 ) ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -304,7 +305,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/csysv_rook.f b/lapack-netlib/SRC/csysv_rook.f index aad594e21..8798ddfb2 100644 --- a/lapack-netlib/SRC/csysv_rook.f +++ b/lapack-netlib/SRC/csysv_rook.f @@ -181,7 +181,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYsolve +*> \ingroup hesv_rook * *> \par Contributors: * ================== @@ -223,7 +223,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CSYTRF_ROOK, CSYTRS_ROOK @@ -258,7 +259,7 @@ CALL CSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) LWKOPT = INT( WORK( 1 ) ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -281,7 +282,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/csysvx.f b/lapack-netlib/SRC/csysvx.f index 2afa082a9..3c7a37889 100644 --- a/lapack-netlib/SRC/csysvx.f +++ b/lapack-netlib/SRC/csysvx.f @@ -276,7 +276,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYsolve +*> \ingroup hesvx * * ===================================================================== SUBROUTINE CSYSVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B, @@ -313,8 +313,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANSY, SLAMCH - EXTERNAL ILAENV, LSAME, CLANSY, SLAMCH + REAL CLANSY, SLAMCH, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, CLANSY, SLAMCH, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLACPY, CSYCON, CSYRFS, CSYTRF, CSYTRS, XERBLA @@ -356,7 +356,7 @@ NB = ILAENV( 1, 'CSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -405,7 +405,7 @@ IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) $ INFO = N + 1 * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/csytrf.f b/lapack-netlib/SRC/csytrf.f index ebf228f18..519e78490 100644 --- a/lapack-netlib/SRC/csytrf.f +++ b/lapack-netlib/SRC/csytrf.f @@ -135,7 +135,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hetrf * *> \par Further Details: * ===================== @@ -202,7 +202,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLASYF, CSYTF2, XERBLA @@ -232,8 +233,8 @@ * Determine the block size * NB = ILAENV( 1, 'CSYTRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = LWKOPT + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -351,7 +352,7 @@ END IF * 40 CONTINUE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CSYTRF diff --git a/lapack-netlib/SRC/csytrf_aa.f b/lapack-netlib/SRC/csytrf_aa.f index b1165a425..cf994913d 100644 --- a/lapack-netlib/SRC/csytrf_aa.f +++ b/lapack-netlib/SRC/csytrf_aa.f @@ -74,7 +74,7 @@ *> *> On exit, the tridiagonal matrix is stored in the diagonals *> and the subdiagonals of A just below (or above) the diagonals, -*> and L is stored below (or above) the subdiaonals, when UPLO +*> and L is stored below (or above) the subdiagonals, when UPLO *> is 'L' (or 'U'). *> \endverbatim *> @@ -125,7 +125,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hetrf_aa * * ===================================================================== SUBROUTINE CSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) @@ -159,7 +159,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLASYF_AA, CGEMM, CGEMV, CSCAL, CSWAP, CCOPY, @@ -191,7 +192,7 @@ * IF( INFO.EQ.0 ) THEN LWKOPT = (NB+1)*N - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -457,7 +458,7 @@ END IF * 20 CONTINUE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CSYTRF_AA diff --git a/lapack-netlib/SRC/csytrf_aa_2stage.f b/lapack-netlib/SRC/csytrf_aa_2stage.f index 8a2cfd7bc..e56aedaf6 100644 --- a/lapack-netlib/SRC/csytrf_aa_2stage.f +++ b/lapack-netlib/SRC/csytrf_aa_2stage.f @@ -75,7 +75,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE CSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -188,7 +188,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CCOPY, CGBTRF, CGEMM, CGETRF, CLACPY, @@ -230,7 +231,7 @@ TB( 1 ) = (3*NB+1)*N END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = SROUNDUP_LWORK(N*NB) END IF END IF IF( TQUERY .OR. WQUERY ) THEN diff --git a/lapack-netlib/SRC/csytrf_rk.f b/lapack-netlib/SRC/csytrf_rk.f index 9c2b7182f..de39bda41 100644 --- a/lapack-netlib/SRC/csytrf_rk.f +++ b/lapack-netlib/SRC/csytrf_rk.f @@ -229,7 +229,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hetrf_rk * *> \par Further Details: * ===================== @@ -280,7 +280,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLASYF_RK, CSYTF2_RK, CSWAP, XERBLA @@ -310,8 +311,8 @@ * Determine the block size * NB = ILAENV( 1, 'CSYTRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = LWKOPT + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -487,7 +488,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CSYTRF_RK diff --git a/lapack-netlib/SRC/csytrf_rook.f b/lapack-netlib/SRC/csytrf_rook.f index ce7c1e586..72fe0629f 100644 --- a/lapack-netlib/SRC/csytrf_rook.f +++ b/lapack-netlib/SRC/csytrf_rook.f @@ -146,7 +146,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hetrf_rook * *> \par Further Details: * ===================== @@ -228,7 +228,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLASYF_ROOK, CSYTF2_ROOK, XERBLA @@ -259,7 +260,7 @@ * NB = ILAENV( 1, 'CSYTRF_ROOK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -382,7 +383,7 @@ END IF * 40 CONTINUE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CSYTRF_ROOK diff --git a/lapack-netlib/SRC/csytri_3.f b/lapack-netlib/SRC/csytri_3.f index 279f62853..604d84b21 100644 --- a/lapack-netlib/SRC/csytri_3.f +++ b/lapack-netlib/SRC/csytri_3.f @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hetri_3 * *> \par Contributors: * ================== @@ -190,7 +190,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CSYTRI_3X, XERBLA @@ -225,7 +226,7 @@ CALL XERBLA( 'CSYTRI_3', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * @@ -236,7 +237,7 @@ * CALL CSYTRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO ) * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/csytrs_aa.f b/lapack-netlib/SRC/csytrs_aa.f index 1f6ea40af..7f63539a6 100644 --- a/lapack-netlib/SRC/csytrs_aa.f +++ b/lapack-netlib/SRC/csytrs_aa.f @@ -123,7 +123,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hetrs_aa * * ===================================================================== SUBROUTINE CSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, @@ -155,7 +155,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLACPY, CGTSV, CSWAP, CTRSM, XERBLA @@ -186,7 +187,7 @@ RETURN ELSE IF( LQUERY ) THEN LWKOPT = (3*N-2) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * diff --git a/lapack-netlib/SRC/ctgevc.f b/lapack-netlib/SRC/ctgevc.f index 4e5289cb2..22144f259 100644 --- a/lapack-netlib/SRC/ctgevc.f +++ b/lapack-netlib/SRC/ctgevc.f @@ -53,7 +53,7 @@ *> *> S*x = w*P*x, (y**H)*S = w*(y**H)*P, *> -*> where y**H denotes the conjugate tranpose of y. +*> where y**H denotes the conjugate transpose of y. *> The eigenvalues are not input to this routine, but are computed *> directly from the diagonal elements of S and P. *> @@ -154,7 +154,7 @@ *> \verbatim *> VR is COMPLEX array, dimension (LDVR,MM) *> On entry, if SIDE = 'R' or 'B' and HOWMNY = 'B', VR must -*> contain an N-by-N matrix Q (usually the unitary matrix Z +*> contain an N-by-N matrix Z (usually the unitary matrix Z *> of right Schur vectors returned by CHGEQZ). *> On exit, if SIDE = 'R' or 'B', VR contains: *> if HOWMNY = 'A', the matrix X of right eigenvectors of (S,P); @@ -259,7 +259,7 @@ EXTERNAL LSAME, SLAMCH, CLADIV * .. * .. External Subroutines .. - EXTERNAL CGEMV, SLABAD, XERBLA + EXTERNAL CGEMV, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CMPLX, CONJG, MAX, MIN, REAL @@ -367,7 +367,6 @@ * SAFMIN = SLAMCH( 'Safe minimum' ) BIG = ONE / SAFMIN - CALL SLABAD( SAFMIN, BIG ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) SMALL = SAFMIN*N / ULP BIG = ONE / SMALL diff --git a/lapack-netlib/SRC/ctgsen.f b/lapack-netlib/SRC/ctgsen.f index f9b6cd10c..180e96b32 100644 --- a/lapack-netlib/SRC/ctgsen.f +++ b/lapack-netlib/SRC/ctgsen.f @@ -290,7 +290,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup tgsen * *> \par Further Details: * ===================== @@ -339,7 +339,7 @@ *> [ kron(In2, B11) -kron(B22**H, In1) ]. *> *> Here, Inx is the identity matrix of size nx and A22**H is the -*> conjuguate transpose of A22. kron(X, Y) is the Kronecker product between +*> conjugate transpose of A22. kron(X, Y) is the Kronecker product between *> the matrices X and Y. *> *> When DIF(2) is small, small changes in (A, B) can cause large changes @@ -467,6 +467,10 @@ * .. Local Arrays .. INTEGER ISAVE( 3 ) * .. +* .. External Functions .. + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. REAL SLAMCH EXTERNAL CLACN2, CLACPY, CLASSQ, CSCAL, CTGEXC, CTGSYL, @@ -537,7 +541,7 @@ LIWMIN = 1 END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -771,7 +775,7 @@ * 70 CONTINUE * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/ctgsna.f b/lapack-netlib/SRC/ctgsna.f index 2295dc5cc..50498c413 100644 --- a/lapack-netlib/SRC/ctgsna.f +++ b/lapack-netlib/SRC/ctgsna.f @@ -213,7 +213,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup tgsna * *> \par Further Details: * ===================== @@ -343,12 +343,13 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SCNRM2, SLAMCH, SLAPY2 + REAL SCNRM2, SLAMCH, SLAPY2, SROUNDUP_LWORK COMPLEX CDOTC - EXTERNAL LSAME, SCNRM2, SLAMCH, SLAPY2, CDOTC + EXTERNAL LSAME, SCNRM2, SLAMCH, SLAPY2, SROUNDUP_LWORK, + $ CDOTC * .. * .. External Subroutines .. - EXTERNAL CGEMV, CLACPY, CTGEXC, CTGSYL, SLABAD, XERBLA + EXTERNAL CGEMV, CLACPY, CTGEXC, CTGSYL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, CMPLX, MAX @@ -402,7 +403,7 @@ ELSE LWMIN = N END IF - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * IF( MM.LT.M ) THEN INFO = -15 @@ -428,7 +429,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) / EPS BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) KS = 0 DO 20 K = 1, N * @@ -508,7 +508,7 @@ END IF * 20 CONTINUE - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RETURN * * End of CTGSNA diff --git a/lapack-netlib/SRC/ctgsyl.f b/lapack-netlib/SRC/ctgsyl.f index ae1437125..620556399 100644 --- a/lapack-netlib/SRC/ctgsyl.f +++ b/lapack-netlib/SRC/ctgsyl.f @@ -260,7 +260,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup tgsyl * *> \par Contributors: * ================== @@ -329,7 +329,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGEMM, CLACPY, CLASET, CSCAL, CTGSY2, XERBLA @@ -382,7 +383,7 @@ ELSE LWMIN = 1 END IF - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -20 @@ -683,7 +684,7 @@ 210 CONTINUE END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/ctrevc3.f b/lapack-netlib/SRC/ctrevc3.f index 0f58696b2..13cbf553f 100644 --- a/lapack-netlib/SRC/ctrevc3.f +++ b/lapack-netlib/SRC/ctrevc3.f @@ -222,7 +222,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup trevc3 * *> \par Further Details: * ===================== @@ -278,12 +278,13 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV, ICAMAX - REAL SLAMCH, SCASUM - EXTERNAL LSAME, ILAENV, ICAMAX, SLAMCH, SCASUM + REAL SLAMCH, SCASUM, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, ICAMAX, SLAMCH, SCASUM, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CCOPY, CLASET, CSSCAL, CGEMM, CGEMV, - $ CLATRS, CLACPY, SLABAD + $ CLATRS, CLACPY * .. * .. Intrinsic Functions .. INTRINSIC ABS, REAL, CMPLX, CONJG, AIMAG, MAX @@ -321,9 +322,9 @@ * INFO = 0 NB = ILAENV( 1, 'CTREVC', SIDE // HOWMNY, N, -1, -1, -1 ) - MAXWRK = N + 2*N*NB - WORK(1) = MAXWRK - RWORK(1) = N + MAXWRK = MAX( 1, N + 2*N*NB ) + WORK(1) = SROUNDUP_LWORK(MAXWRK) + RWORK(1) = MAX( 1, N ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN INFO = -1 @@ -371,7 +372,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Precision' ) SMLNUM = UNFL*( N / ULP ) * diff --git a/lapack-netlib/SRC/ctrexc.f b/lapack-netlib/SRC/ctrexc.f index ea64ddf9b..2bc0348fb 100644 --- a/lapack-netlib/SRC/ctrexc.f +++ b/lapack-netlib/SRC/ctrexc.f @@ -40,7 +40,7 @@ *> *> The Schur form T is reordered by a unitary similarity transformation *> Z**H*T*Z, and optionally the matrix Q of Schur vectors is updated by -*> postmultplying it with Z. +*> postmultiplying it with Z. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/ctrsen.f b/lapack-netlib/SRC/ctrsen.f index d93b97be6..9d59f6bf2 100644 --- a/lapack-netlib/SRC/ctrsen.f +++ b/lapack-netlib/SRC/ctrsen.f @@ -182,7 +182,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup trsen * *> \par Further Details: * ===================== @@ -293,8 +293,8 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL CLANGE - EXTERNAL LSAME, CLANGE + REAL CLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, CLANGE, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLACN2, CLACPY, CTREXC, CTRSYL, XERBLA @@ -350,7 +350,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) END IF * IF( INFO.NE.0 ) THEN @@ -444,7 +444,7 @@ W( K ) = T( K, K ) 50 CONTINUE * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/ctzrzf.f b/lapack-netlib/SRC/ctzrzf.f index b21f092ce..ac3f59400 100644 --- a/lapack-netlib/SRC/ctzrzf.f +++ b/lapack-netlib/SRC/ctzrzf.f @@ -116,7 +116,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup tzrzf * *> \par Contributors: * ================== @@ -179,7 +179,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -207,7 +208,7 @@ LWKOPT = M*NB LWKMIN = MAX( 1, M ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 @@ -301,7 +302,7 @@ IF( MU.GT.0 ) $ CALL CLATRZ( MU, N, N-M, A, LDA, TAU, WORK ) * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/cunbdb.f b/lapack-netlib/SRC/cunbdb.f index a41895dc8..b45dcfde6 100644 --- a/lapack-netlib/SRC/cunbdb.f +++ b/lapack-netlib/SRC/cunbdb.f @@ -255,7 +255,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unbdb * *> \par Further Details: * ===================== @@ -320,9 +320,9 @@ * * .. * .. External Functions .. - REAL SCNRM2 + REAL SCNRM2, SROUNDUP_LWORK LOGICAL LSAME - EXTERNAL SCNRM2, LSAME + EXTERNAL SCNRM2, SROUNDUP_LWORK, LSAME * .. * .. Intrinsic Functions INTRINSIC ATAN2, COS, MAX, MIN, SIN @@ -377,7 +377,7 @@ IF( INFO .EQ. 0 ) THEN LWORKOPT = M - Q LWORKMIN = M - Q - WORK(1) = LWORKOPT + WORK(1) = SROUNDUP_LWORK(LWORKOPT) IF( LWORK .LT. LWORKMIN .AND. .NOT. LQUERY ) THEN INFO = -21 END IF diff --git a/lapack-netlib/SRC/cunbdb1.f b/lapack-netlib/SRC/cunbdb1.f index 740e38a85..a4875ab5b 100644 --- a/lapack-netlib/SRC/cunbdb1.f +++ b/lapack-netlib/SRC/cunbdb1.f @@ -37,7 +37,7 @@ *>\verbatim *> *> CUNBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unbdb1 * *> \par Further Details: * ===================== @@ -230,8 +230,8 @@ EXTERNAL CLACGV * .. * .. External Functions .. - REAL SCNRM2 - EXTERNAL SCNRM2 + REAL SCNRM2, SROUNDUP_LWORK + EXTERNAL SCNRM2, SROUNDUP_LWORK * .. * .. Intrinsic Function .. INTRINSIC ATAN2, COS, MAX, SIN, SQRT @@ -264,7 +264,7 @@ LORBDB5 = Q-2 LWORKOPT = MAX( ILARF+LLARF-1, IORBDB5+LORBDB5-1 ) LWORKMIN = LWORKOPT - WORK(1) = LWORKOPT + WORK(1) = SROUNDUP_LWORK(LWORKOPT) IF( LWORK .LT. LWORKMIN .AND. .NOT.LQUERY ) THEN INFO = -14 END IF diff --git a/lapack-netlib/SRC/cunbdb2.f b/lapack-netlib/SRC/cunbdb2.f index b45db6100..6399964f8 100644 --- a/lapack-netlib/SRC/cunbdb2.f +++ b/lapack-netlib/SRC/cunbdb2.f @@ -37,7 +37,7 @@ *>\verbatim *> *> CUNBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unbdb2 * *> \par Further Details: * ===================== @@ -231,8 +231,8 @@ $ XERBLA * .. * .. External Functions .. - REAL SCNRM2 - EXTERNAL SCNRM2 + REAL SCNRM2, SROUNDUP_LWORK + EXTERNAL SCNRM2, SROUNDUP_LWORK * .. * .. Intrinsic Function .. INTRINSIC ATAN2, COS, MAX, SIN, SQRT @@ -265,7 +265,7 @@ LORBDB5 = Q-1 LWORKOPT = MAX( ILARF+LLARF-1, IORBDB5+LORBDB5-1 ) LWORKMIN = LWORKOPT - WORK(1) = LWORKOPT + WORK(1) = SROUNDUP_LWORK(LWORKOPT) IF( LWORK .LT. LWORKMIN .AND. .NOT.LQUERY ) THEN INFO = -14 END IF diff --git a/lapack-netlib/SRC/cunbdb3.f b/lapack-netlib/SRC/cunbdb3.f index b532bfbc8..d02460597 100644 --- a/lapack-netlib/SRC/cunbdb3.f +++ b/lapack-netlib/SRC/cunbdb3.f @@ -37,7 +37,7 @@ *>\verbatim *> *> CUNBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unbdb3 * *> \par Further Details: * ===================== @@ -229,8 +229,8 @@ EXTERNAL CLARF, CLARFGP, CUNBDB5, CSROT, CLACGV, XERBLA * .. * .. External Functions .. - REAL SCNRM2 - EXTERNAL SCNRM2 + REAL SCNRM2, SROUNDUP_LWORK + EXTERNAL SCNRM2, SROUNDUP_LWORK * .. * .. Intrinsic Function .. INTRINSIC ATAN2, COS, MAX, SIN, SQRT @@ -263,7 +263,7 @@ LORBDB5 = Q-1 LWORKOPT = MAX( ILARF+LLARF-1, IORBDB5+LORBDB5-1 ) LWORKMIN = LWORKOPT - WORK(1) = LWORKOPT + WORK(1) = SROUNDUP_LWORK(LWORKOPT) IF( LWORK .LT. LWORKMIN .AND. .NOT.LQUERY ) THEN INFO = -14 END IF diff --git a/lapack-netlib/SRC/cunbdb4.f b/lapack-netlib/SRC/cunbdb4.f index 117f23d08..33acc1ee5 100644 --- a/lapack-netlib/SRC/cunbdb4.f +++ b/lapack-netlib/SRC/cunbdb4.f @@ -38,7 +38,7 @@ *>\verbatim *> *> CUNBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] @@ -183,7 +183,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unbdb4 * *> \par Further Details: * ===================== @@ -242,8 +242,8 @@ $ XERBLA * .. * .. External Functions .. - REAL SCNRM2 - EXTERNAL SCNRM2 + REAL SCNRM2, SROUNDUP_LWORK + EXTERNAL SCNRM2, SROUNDUP_LWORK * .. * .. Intrinsic Function .. INTRINSIC ATAN2, COS, MAX, SIN, SQRT @@ -277,7 +277,7 @@ LWORKOPT = ILARF + LLARF - 1 LWORKOPT = MAX( LWORKOPT, IORBDB5 + LORBDB5 - 1 ) LWORKMIN = LWORKOPT - WORK(1) = LWORKOPT + WORK(1) = SROUNDUP_LWORK(LWORKOPT) IF( LWORK .LT. LWORKMIN .AND. .NOT.LQUERY ) THEN INFO = -14 END IF diff --git a/lapack-netlib/SRC/cunbdb5.f b/lapack-netlib/SRC/cunbdb5.f index d2ff4e700..22513cf8b 100644 --- a/lapack-netlib/SRC/cunbdb5.f +++ b/lapack-netlib/SRC/cunbdb5.f @@ -148,7 +148,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unbdb5 * * ===================================================================== SUBROUTINE CUNBDB5( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, @@ -169,18 +169,21 @@ * ===================================================================== * * .. Parameters .. + REAL REALZERO + PARAMETER ( REALZERO = 0.0E0 ) COMPLEX ONE, ZERO PARAMETER ( ONE = (1.0E0,0.0E0), ZERO = (0.0E0,0.0E0) ) * .. * .. Local Scalars .. INTEGER CHILDINFO, I, J + REAL EPS, NORM, SCL, SSQ * .. * .. External Subroutines .. - EXTERNAL CUNBDB6, XERBLA + EXTERNAL CLASSQ, CUNBDB6, CSCAL, XERBLA * .. * .. External Functions .. - REAL SCNRM2 - EXTERNAL SCNRM2 + REAL SLAMCH, SCNRM2 + EXTERNAL SLAMCH, SCNRM2 * .. * .. Intrinsic Function .. INTRINSIC MAX @@ -213,16 +216,33 @@ RETURN END IF * -* Project X onto the orthogonal complement of Q + EPS = SLAMCH( 'Precision' ) * - CALL CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, LDQ2, - $ WORK, LWORK, CHILDINFO ) +* Project X onto the orthogonal complement of Q if X is nonzero * -* If the projection is nonzero, then return + SCL = REALZERO + SSQ = REALZERO + CALL CLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL CLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM = SCL * SQRT( SSQ ) * - IF( SCNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. SCNRM2(M2,X2,INCX2) .NE. ZERO ) THEN - RETURN + IF( NORM .GT. N * EPS ) THEN +* Scale vector to unit norm to avoid problems in the caller code. +* Computing the reciprocal is undesirable but +* * xLASCL cannot be used because of the vector increments and +* * the round-off error has a negligible impact on +* orthogonalization. + CALL CSCAL( M1, ONE / NORM, X1, INCX1 ) + CALL CSCAL( M2, ONE / NORM, X2, INCX2 ) + CALL CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, + $ LDQ2, WORK, LWORK, CHILDINFO ) +* +* If the projection is nonzero, then return +* + IF( SCNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. SCNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN + RETURN + END IF END IF * * Project each standard basis vector e_1,...,e_M1 in turn, stopping @@ -238,8 +258,8 @@ END DO CALL CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, $ LDQ2, WORK, LWORK, CHILDINFO ) - IF( SCNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. SCNRM2(M2,X2,INCX2) .NE. ZERO ) THEN + IF( SCNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. SCNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN RETURN END IF END DO @@ -257,8 +277,8 @@ X2(I) = ONE CALL CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, $ LDQ2, WORK, LWORK, CHILDINFO ) - IF( SCNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. SCNRM2(M2,X2,INCX2) .NE. ZERO ) THEN + IF( SCNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. SCNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN RETURN END IF END DO diff --git a/lapack-netlib/SRC/cunbdb6.f b/lapack-netlib/SRC/cunbdb6.f index b93a389d6..566fd76b7 100644 --- a/lapack-netlib/SRC/cunbdb6.f +++ b/lapack-netlib/SRC/cunbdb6.f @@ -41,9 +41,8 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The Euclidean norm of X must be one and the columns of Q must be -*> orthonormal. The orthogonalized vector will be zero if and only if it -*> lies entirely in the range of Q. +*> The columns of Q must be orthonormal. The orthogonalized vector will +*> be zero if and only if it lies entirely in the range of Q. *> *> The projection is computed with at most two iterations of the *> classical Gram-Schmidt algorithm, see @@ -152,7 +151,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unbdb6 * * ===================================================================== SUBROUTINE CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, @@ -174,7 +173,7 @@ * * .. Parameters .. REAL ALPHA, REALONE, REALZERO - PARAMETER ( ALPHA = 0.01E0, REALONE = 1.0E0, + PARAMETER ( ALPHA = 0.83E0, REALONE = 1.0E0, $ REALZERO = 0.0E0 ) COMPLEX NEGONE, ONE, ZERO PARAMETER ( NEGONE = (-1.0E0,0.0E0), ONE = (1.0E0,0.0E0), @@ -223,14 +222,16 @@ * EPS = SLAMCH( 'Precision' ) * +* Compute the Euclidean norm of X +* + SCL = REALZERO + SSQ = REALZERO + CALL CLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL CLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM = SCL * SQRT( SSQ ) +* * First, project X onto the orthogonal complement of Q's column * space -* -* Christoph Conrads: In debugging mode the norm should be computed -* and an assertion added comparing the norm with one. Alas, Fortran -* never made it into 1989 when assert() was introduced into the C -* programming language. - NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N diff --git a/lapack-netlib/SRC/cuncsd.f b/lapack-netlib/SRC/cuncsd.f index 3653a396a..003daaab4 100644 --- a/lapack-netlib/SRC/cuncsd.f +++ b/lapack-netlib/SRC/cuncsd.f @@ -308,7 +308,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup uncsd * * ===================================================================== RECURSIVE SUBROUTINE CUNCSD( JOBU1, JOBU2, JOBV1T, JOBV2T, TRANS, @@ -365,7 +365,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. Intrinsic Functions INTRINSIC INT, MAX, MIN @@ -504,7 +505,8 @@ $ IORBDB + LORBDBWORKOPT ) - 1 LWORKMIN = MAX( IORGQR + LORGQRWORKMIN, IORGLQ + LORGLQWORKMIN, $ IORBDB + LORBDBWORKMIN ) - 1 - WORK(1) = MAX(LWORKOPT,LWORKMIN) + LWORKOPT = MAX(LWORKOPT,LWORKMIN) + WORK(1) = SROUNDUP_LWORK(LWORKOPT) * IF( LWORK .LT. LWORKMIN $ .AND. .NOT. ( LQUERY .OR. LRQUERY ) ) THEN diff --git a/lapack-netlib/SRC/cuncsd2by1.f b/lapack-netlib/SRC/cuncsd2by1.f index 8360d5932..128e82cec 100644 --- a/lapack-netlib/SRC/cuncsd2by1.f +++ b/lapack-netlib/SRC/cuncsd2by1.f @@ -212,13 +212,13 @@ *> LRWORK is INTEGER *> The dimension of the array RWORK. *> -*> If LRWORK = -1, then a workspace query is assumed; the routine +*> If LRWORK=-1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK and RWORK *> arrays, returns this value as the first entry of the WORK *> and RWORK array, respectively, and no error message related *> to LWORK or LRWORK is issued by XERBLA. *> \endverbatim -* +*> *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (M-MIN(P,M-P,Q,M-Q)) @@ -247,7 +247,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup uncsd2by1 * * ===================================================================== SUBROUTINE CUNCSD2BY1( JOBU1, JOBU2, JOBV1T, M, P, Q, X11, LDX11, @@ -299,7 +299,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. Intrinsic Function .. INTRINSIC INT, MAX, MIN @@ -508,7 +509,7 @@ LWORKOPT = MAX( IORBDB+LORBDB-1, $ IORGQR+LORGQROPT-1, $ IORGLQ+LORGLQOPT-1 ) - WORK(1) = LWORKOPT + WORK(1) = SROUNDUP_LWORK(LWORKOPT) IF( LWORK .LT. LWORKMIN .AND. .NOT.LQUERY ) THEN INFO = -19 END IF diff --git a/lapack-netlib/SRC/cungbr.f b/lapack-netlib/SRC/cungbr.f index a31a53d79..2f0208fdb 100644 --- a/lapack-netlib/SRC/cungbr.f +++ b/lapack-netlib/SRC/cungbr.f @@ -150,7 +150,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGBcomputational +*> \ingroup ungbr * * ===================================================================== SUBROUTINE CUNGBR( VECT, M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -180,7 +180,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CUNGLQ, CUNGQR, XERBLA @@ -241,7 +242,7 @@ CALL XERBLA( 'CUNGBR', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * @@ -327,7 +328,7 @@ END IF END IF END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CUNGBR diff --git a/lapack-netlib/SRC/cunghr.f b/lapack-netlib/SRC/cunghr.f index 4f8a0a263..3aa3fb1ae 100644 --- a/lapack-netlib/SRC/cunghr.f +++ b/lapack-netlib/SRC/cunghr.f @@ -119,7 +119,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unghr * * ===================================================================== SUBROUTINE CUNGHR( N, ILO, IHI, A, LDA, TAU, WORK, LWORK, INFO ) @@ -151,7 +151,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -178,7 +179,7 @@ IF( INFO.EQ.0 ) THEN NB = ILAENV( 1, 'CUNGQR', ' ', NH, NH, NH, -1 ) LWKOPT = MAX( 1, NH )*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -230,7 +231,7 @@ CALL CUNGQR( NH, NH, NH, A( ILO+1, ILO+1 ), LDA, TAU( ILO ), $ WORK, LWORK, IINFO ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CUNGHR diff --git a/lapack-netlib/SRC/cunglq.f b/lapack-netlib/SRC/cunglq.f index e250e036c..353715054 100644 --- a/lapack-netlib/SRC/cunglq.f +++ b/lapack-netlib/SRC/cunglq.f @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unglq * * ===================================================================== SUBROUTINE CUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -155,7 +155,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -164,7 +165,7 @@ INFO = 0 NB = ILAENV( 1, 'CUNGLQ', ' ', M, N, K, -1 ) LWKOPT = MAX( 1, M )*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -278,7 +279,7 @@ 50 CONTINUE END IF * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of CUNGLQ diff --git a/lapack-netlib/SRC/cungql.f b/lapack-netlib/SRC/cungql.f index d3b812a62..ed2f6803c 100644 --- a/lapack-netlib/SRC/cungql.f +++ b/lapack-netlib/SRC/cungql.f @@ -121,7 +121,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup ungql * * ===================================================================== SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -156,7 +156,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -181,7 +182,7 @@ NB = ILAENV( 1, 'CUNGQL', ' ', M, N, K, -1 ) LWKOPT = N*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN INFO = -8 diff --git a/lapack-netlib/SRC/cungqr.f b/lapack-netlib/SRC/cungqr.f index 5010ae0df..b6e8cc59a 100644 --- a/lapack-netlib/SRC/cungqr.f +++ b/lapack-netlib/SRC/cungqr.f @@ -121,7 +121,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup ungqr * * ===================================================================== SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -156,7 +156,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -165,7 +166,7 @@ INFO = 0 NB = ILAENV( 1, 'CUNGQR', ' ', M, N, K, -1 ) LWKOPT = MAX( 1, N )*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -279,7 +280,7 @@ 50 CONTINUE END IF * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of CUNGQR diff --git a/lapack-netlib/SRC/cungrq.f b/lapack-netlib/SRC/cungrq.f index 1593ff938..aceaac0b8 100644 --- a/lapack-netlib/SRC/cungrq.f +++ b/lapack-netlib/SRC/cungrq.f @@ -121,7 +121,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup ungrq * * ===================================================================== SUBROUTINE CUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -156,7 +156,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -181,7 +182,7 @@ NB = ILAENV( 1, 'CUNGRQ', ' ', M, N, K, -1 ) LWKOPT = M*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN INFO = -8 @@ -286,7 +287,7 @@ 50 CONTINUE END IF * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of CUNGRQ diff --git a/lapack-netlib/SRC/cungtr.f b/lapack-netlib/SRC/cungtr.f index 26ff0428e..27f197340 100644 --- a/lapack-netlib/SRC/cungtr.f +++ b/lapack-netlib/SRC/cungtr.f @@ -116,7 +116,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup ungtr * * ===================================================================== SUBROUTINE CUNGTR( UPLO, N, A, LDA, TAU, WORK, LWORK, INFO ) @@ -147,7 +147,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CUNGQL, CUNGQR, XERBLA @@ -179,7 +180,7 @@ NB = ILAENV( 1, 'CUNGQR', ' ', N-1, N-1, N-1, -1 ) END IF LWKOPT = MAX( 1, N-1 )*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -245,7 +246,7 @@ $ LWORK, IINFO ) END IF END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CUNGTR diff --git a/lapack-netlib/SRC/cungtsqr.f b/lapack-netlib/SRC/cungtsqr.f index 64ccb9731..1734be2aa 100644 --- a/lapack-netlib/SRC/cungtsqr.f +++ b/lapack-netlib/SRC/cungtsqr.f @@ -133,6 +133,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= (M+NB)*N. *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK @@ -302,4 +303,4 @@ * * End of CUNGTSQR * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/cunmbr.f b/lapack-netlib/SRC/cunmbr.f index cef6025b0..a21c486e9 100644 --- a/lapack-netlib/SRC/cunmbr.f +++ b/lapack-netlib/SRC/cunmbr.f @@ -189,7 +189,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unmbr * * ===================================================================== SUBROUTINE CUNMBR( VECT, SIDE, TRANS, M, N, K, A, LDA, TAU, C, @@ -218,7 +218,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CUNMLQ, CUNMQR, XERBLA @@ -290,7 +291,7 @@ ELSE LWKOPT = 1 END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -367,7 +368,7 @@ $ TAU, C( I1, I2 ), LDC, WORK, LWORK, IINFO ) END IF END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CUNMBR diff --git a/lapack-netlib/SRC/cunmhr.f b/lapack-netlib/SRC/cunmhr.f index af3140d5f..29bb631f1 100644 --- a/lapack-netlib/SRC/cunmhr.f +++ b/lapack-netlib/SRC/cunmhr.f @@ -171,7 +171,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unmhr * * ===================================================================== SUBROUTINE CUNMHR( SIDE, TRANS, M, N, ILO, IHI, A, LDA, TAU, C, @@ -199,7 +199,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CUNMQR, XERBLA @@ -253,7 +254,7 @@ NB = ILAENV( 1, 'CUNMQR', SIDE // TRANS, M, NH, NH, -1 ) END IF LWKOPT = NW*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -285,7 +286,7 @@ CALL CUNMQR( SIDE, TRANS, MI, NI, NH, A( ILO+1, ILO ), LDA, $ TAU( ILO ), C( I1, I2 ), LDC, WORK, LWORK, IINFO ) * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CUNMHR diff --git a/lapack-netlib/SRC/cunmlq.f b/lapack-netlib/SRC/cunmlq.f index 25a410770..4da1af1d5 100644 --- a/lapack-netlib/SRC/cunmlq.f +++ b/lapack-netlib/SRC/cunmlq.f @@ -160,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unmlq * * ===================================================================== SUBROUTINE CUNMLQ( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC, @@ -195,7 +195,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLARFB, CLARFT, CUNML2, XERBLA @@ -250,7 +251,7 @@ $ K, -1 ) ) LWKOPT = NW*NB + TSIZE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -343,7 +344,7 @@ $ C( IC, JC ), LDC, WORK, LDWORK ) 10 CONTINUE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CUNMLQ diff --git a/lapack-netlib/SRC/cunmql.f b/lapack-netlib/SRC/cunmql.f index 3c7166066..84fc29d32 100644 --- a/lapack-netlib/SRC/cunmql.f +++ b/lapack-netlib/SRC/cunmql.f @@ -160,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unmql * * ===================================================================== SUBROUTINE CUNMQL( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC, @@ -194,7 +194,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLARFB, CLARFT, CUNM2L, XERBLA @@ -249,7 +250,7 @@ $ K, -1 ) ) LWKOPT = NW*NB + TSIZE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -332,7 +333,7 @@ $ WORK, LDWORK ) 10 CONTINUE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CUNMQL diff --git a/lapack-netlib/SRC/cunmqr.f b/lapack-netlib/SRC/cunmqr.f index 7e59d7129..7d85a861f 100644 --- a/lapack-netlib/SRC/cunmqr.f +++ b/lapack-netlib/SRC/cunmqr.f @@ -160,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unmqr * * ===================================================================== SUBROUTINE CUNMQR( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC, @@ -194,7 +194,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLARFB, CLARFT, CUNM2R, XERBLA @@ -245,7 +246,7 @@ NB = MIN( NBMAX, ILAENV( 1, 'CUNMQR', SIDE // TRANS, M, N, K, $ -1 ) ) LWKOPT = NW*NB + TSIZE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -331,7 +332,7 @@ $ C( IC, JC ), LDC, WORK, LDWORK ) 10 CONTINUE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CUNMQR diff --git a/lapack-netlib/SRC/cunmrq.f b/lapack-netlib/SRC/cunmrq.f index 5a233f604..f02cfd9a9 100644 --- a/lapack-netlib/SRC/cunmrq.f +++ b/lapack-netlib/SRC/cunmrq.f @@ -160,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unmrq * * ===================================================================== SUBROUTINE CUNMRQ( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC, @@ -195,7 +195,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLARFB, CLARFT, CUNMR2, XERBLA @@ -250,7 +251,7 @@ $ K, -1 ) ) LWKOPT = NW*NB + TSIZE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -337,7 +338,7 @@ $ WORK, LDWORK ) 10 CONTINUE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CUNMRQ diff --git a/lapack-netlib/SRC/cunmrz.f b/lapack-netlib/SRC/cunmrz.f index 8e06f2329..9ccf1878b 100644 --- a/lapack-netlib/SRC/cunmrz.f +++ b/lapack-netlib/SRC/cunmrz.f @@ -168,7 +168,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unmrz * *> \par Contributors: * ================== @@ -213,7 +213,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLARZB, CLARZT, CUNMR3, XERBLA @@ -271,7 +272,7 @@ $ K, -1 ) ) LWKOPT = NW*NB + TSIZE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -371,7 +372,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/cunmtr.f b/lapack-netlib/SRC/cunmtr.f index 097dba91e..6eafc15c4 100644 --- a/lapack-netlib/SRC/cunmtr.f +++ b/lapack-netlib/SRC/cunmtr.f @@ -164,7 +164,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unmtr * * ===================================================================== SUBROUTINE CUNMTR( SIDE, UPLO, TRANS, M, N, A, LDA, TAU, C, LDC, @@ -192,7 +192,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CUNMQL, CUNMQR, XERBLA @@ -256,7 +257,7 @@ END IF END IF LWKOPT = NW*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -301,7 +302,7 @@ CALL CUNMQR( SIDE, TRANS, MI, NI, NQ-1, A( 2, 1 ), LDA, TAU, $ C( I1, I2 ), LDC, WORK, LWORK, IINFO ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CUNMTR diff --git a/lapack-netlib/SRC/dbbcsd.f b/lapack-netlib/SRC/dbbcsd.f index 4fcf9ab5d..913f96a73 100644 --- a/lapack-netlib/SRC/dbbcsd.f +++ b/lapack-netlib/SRC/dbbcsd.f @@ -805,7 +805,7 @@ CALL DLARTGP( B22BULGE, B22E(I-1), WORK(IU2SN+I-1), $ WORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL DLARTGS( B21E(I), B21E(I+1), NU, WORK(IU2CS+I-1), + CALL DLARTGS( B21E(I), B21D(I+1), NU, WORK(IU2CS+I-1), $ WORK(IU2SN+I-1) ) ELSE CALL DLARTGS( B22D(I), B22E(I), MU, WORK(IU2CS+I-1), diff --git a/lapack-netlib/SRC/dbdsdc.f b/lapack-netlib/SRC/dbdsdc.f index 99fe82296..4b6c3e694 100644 --- a/lapack-netlib/SRC/dbdsdc.f +++ b/lapack-netlib/SRC/dbdsdc.f @@ -45,13 +45,6 @@ *> respectively. DBDSDC can be used to compute all singular values, *> and optionally, singular vectors or singular vectors in compact form. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. See DLASD3 for details. -*> *> The code currently calls DLASDQ if singular values only are desired. *> However, it can be slightly modified to compute singular values *> using the divide and conquer method. diff --git a/lapack-netlib/SRC/dbdsqr.f b/lapack-netlib/SRC/dbdsqr.f index c220a5875..bc697a007 100644 --- a/lapack-netlib/SRC/dbdsqr.f +++ b/lapack-netlib/SRC/dbdsqr.f @@ -278,7 +278,7 @@ $ MAXITDIVN, NM1, NM12, NM13, OLDLL, OLDM DOUBLE PRECISION ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU, $ OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL, - $ SINR, SLL, SMAX, SMIN, SMINL, SMINOA, + $ SINR, SLL, SMAX, SMIN, SMINOA, $ SN, THRESH, TOL, TOLMUL, UNFL * .. * .. External Functions .. @@ -391,7 +391,7 @@ DO 30 I = 1, N - 1 SMAX = MAX( SMAX, ABS( E( I ) ) ) 30 CONTINUE - SMINL = ZERO + SMIN = ZERO IF( TOL.GE.ZERO ) THEN * * Relative accuracy desired @@ -451,7 +451,6 @@ IF( TOL.LT.ZERO .AND. ABS( D( M ) ).LE.THRESH ) $ D( M ) = ZERO SMAX = ABS( D( M ) ) - SMIN = SMAX DO 70 LLL = 1, M - 1 LL = M - LLL ABSS = ABS( D( LL ) ) @@ -460,7 +459,6 @@ $ D( LL ) = ZERO IF( ABSE.LE.THRESH ) $ GO TO 80 - SMIN = MIN( SMIN, ABSS ) SMAX = MAX( SMAX, ABSS, ABSE ) 70 CONTINUE LL = 0 @@ -542,14 +540,14 @@ * apply convergence criterion forward * MU = ABS( D( LL ) ) - SMINL = MU + SMIN = MU DO 100 LLL = LL, M - 1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL+1 ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 100 CONTINUE END IF * @@ -570,14 +568,14 @@ * apply convergence criterion backward * MU = ABS( D( M ) ) - SMINL = MU + SMIN = MU DO 110 LLL = M - 1, LL, -1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 110 CONTINUE END IF END IF @@ -587,7 +585,7 @@ * Compute shift. First, test if shifting would ruin relative * accuracy, and if so set the shift to zero. * - IF( TOL.GE.ZERO .AND. N*TOL*( SMINL / SMAX ).LE. + IF( TOL.GE.ZERO .AND. N*TOL*( SMIN / SMAX ).LE. $ MAX( EPS, HNDRTH*TOL ) ) THEN * * Use a zero shift to avoid loss of relative accuracy diff --git a/lapack-netlib/SRC/dbdsvdx.f b/lapack-netlib/SRC/dbdsvdx.f index 4ccd4edad..4668a88f2 100644 --- a/lapack-netlib/SRC/dbdsvdx.f +++ b/lapack-netlib/SRC/dbdsvdx.f @@ -45,7 +45,7 @@ *> *> Given an upper bidiagonal B with diagonal D = [ d_1 d_2 ... d_N ] *> and superdiagonal E = [ e_1 e_2 ... e_N-1 ], DBDSVDX computes the -*> singular value decompositon of B through the eigenvalues and +*> singular value decomposition of B through the eigenvalues and *> eigenvectors of the N*2-by-N*2 tridiagonal matrix *> *> | 0 d_1 | diff --git a/lapack-netlib/SRC/dgebal.f b/lapack-netlib/SRC/dgebal.f index 821c7704a..f7b38b378 100644 --- a/lapack-netlib/SRC/dgebal.f +++ b/lapack-netlib/SRC/dgebal.f @@ -153,6 +153,9 @@ *> *> Modified by Tzu-Yi Chen, Computer Science Division, University of *> California at Berkeley, USA +*> +*> Refactored by Evert Provoost, Department of Computer Science, +*> KU Leuven, Belgium *> \endverbatim *> * ===================================================================== @@ -181,8 +184,8 @@ PARAMETER ( FACTOR = 0.95D+0 ) * .. * .. Local Scalars .. - LOGICAL NOCONV - INTEGER I, ICA, IEXC, IRA, J, K, L, M + LOGICAL NOCONV, CANSWAP + INTEGER I, ICA, IRA, J, K, L DOUBLE PRECISION C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1, $ SFMIN2 * .. @@ -214,177 +217,192 @@ RETURN END IF * - K = 1 - L = N +* Quick returns. * - IF( N.EQ.0 ) - $ GO TO 210 + IF( N.EQ.0 ) THEN + ILO = 1 + IHI = 0 + RETURN + END IF * IF( LSAME( JOB, 'N' ) ) THEN - DO 10 I = 1, N + DO I = 1, N SCALE( I ) = ONE - 10 CONTINUE - GO TO 210 + END DO + ILO = 1 + IHI = N + RETURN END IF * - IF( LSAME( JOB, 'S' ) ) - $ GO TO 120 -* -* Permutation to isolate eigenvalues if possible -* - GO TO 50 -* -* Row and column exchange. -* - 20 CONTINUE - SCALE( M ) = J - IF( J.EQ.M ) - $ GO TO 30 -* - CALL DSWAP( L, A( 1, J ), 1, A( 1, M ), 1 ) - CALL DSWAP( N-K+1, A( J, K ), LDA, A( M, K ), LDA ) -* - 30 CONTINUE - GO TO ( 40, 80 )IEXC -* -* Search for rows isolating an eigenvalue and push them down. -* - 40 CONTINUE - IF( L.EQ.1 ) - $ GO TO 210 - L = L - 1 -* - 50 CONTINUE - DO 70 J = L, 1, -1 +* Permutation to isolate eigenvalues if possible. * - DO 60 I = 1, L - IF( I.EQ.J ) - $ GO TO 60 - IF( A( J, I ).NE.ZERO ) - $ GO TO 70 - 60 CONTINUE -* - M = L - IEXC = 1 - GO TO 20 - 70 CONTINUE -* - GO TO 90 + K = 1 + L = N * -* Search for columns isolating an eigenvalue and push them left. + IF( .NOT.LSAME( JOB, 'S' ) ) THEN * - 80 CONTINUE - K = K + 1 +* Row and column exchange. * - 90 CONTINUE - DO 110 J = K, L + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for rows isolating an eigenvalue and push them down. +* + NOCONV = .FALSE. + DO I = L, 1, -1 + CANSWAP = .TRUE. + DO J = 1, L + IF( I.NE.J .AND. A( I, J ).NE.ZERO ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( L ) = I + IF( I.NE.L ) THEN + CALL DSWAP( L, A( 1, I ), 1, A( 1, L ), 1 ) + CALL DSWAP( N-K+1, A( I, K ), LDA, A( L, K ), LDA ) + END IF + NOCONV = .TRUE. +* + IF( L.EQ.1 ) THEN + ILO = 1 + IHI = 1 + RETURN + END IF +* + L = L - 1 + END IF + END DO +* + END DO + + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for columns isolating an eigenvalue and push them left. +* + NOCONV = .FALSE. + DO J = K, L + CANSWAP = .TRUE. + DO I = K, L + IF( I.NE.J .AND. A( I, J ).NE.ZERO ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( K ) = J + IF( J.NE.K ) THEN + CALL DSWAP( L, A( 1, J ), 1, A( 1, K ), 1 ) + CALL DSWAP( N-K+1, A( J, K ), LDA, A( K, K ), LDA ) + END IF + NOCONV = .TRUE. +* + K = K + 1 + END IF + END DO +* + END DO * - DO 100 I = K, L - IF( I.EQ.J ) - $ GO TO 100 - IF( A( I, J ).NE.ZERO ) - $ GO TO 110 - 100 CONTINUE + END IF * - M = K - IEXC = 2 - GO TO 20 - 110 CONTINUE +* Initialize SCALE for non-permuted submatrix. * - 120 CONTINUE - DO 130 I = K, L + DO I = K, L SCALE( I ) = ONE - 130 CONTINUE + END DO * - IF( LSAME( JOB, 'P' ) ) - $ GO TO 210 +* If we only had to permute, we are done. +* + IF( LSAME( JOB, 'P' ) ) THEN + ILO = K + IHI = L + RETURN + END IF * * Balance the submatrix in rows K to L. * -* Iterative loop for norm reduction +* Iterative loop for norm reduction. * SFMIN1 = DLAMCH( 'S' ) / DLAMCH( 'P' ) SFMAX1 = ONE / SFMIN1 SFMIN2 = SFMIN1*SCLFAC SFMAX2 = ONE / SFMIN2 * - 140 CONTINUE - NOCONV = .FALSE. + NOCONV = .TRUE. + DO WHILE( NOCONV ) + NOCONV = .FALSE. * - DO 200 I = K, L + DO I = K, L * - C = DNRM2( L-K+1, A( K, I ), 1 ) - R = DNRM2( L-K+1, A( I, K ), LDA ) - ICA = IDAMAX( L, A( 1, I ), 1 ) - CA = ABS( A( ICA, I ) ) - IRA = IDAMAX( N-K+1, A( I, K ), LDA ) - RA = ABS( A( I, IRA+K-1 ) ) + C = DNRM2( L-K+1, A( K, I ), 1 ) + R = DNRM2( L-K+1, A( I, K ), LDA ) + ICA = IDAMAX( L, A( 1, I ), 1 ) + CA = ABS( A( ICA, I ) ) + IRA = IDAMAX( N-K+1, A( I, K ), LDA ) + RA = ABS( A( I, IRA+K-1 ) ) * -* Guard against zero C or R due to underflow. +* Guard against zero C or R due to underflow. * - IF( C.EQ.ZERO .OR. R.EQ.ZERO ) - $ GO TO 200 - G = R / SCLFAC - F = ONE - S = C + R - 160 CONTINUE - IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR. - $ MIN( R, G, RA ).LE.SFMIN2 )GO TO 170 - IF( DISNAN( C+F+CA+R+G+RA ) ) THEN + IF( C.EQ.ZERO .OR. R.EQ.ZERO ) CYCLE * * Exit if NaN to avoid infinite loop * - INFO = -3 - CALL XERBLA( 'DGEBAL', -INFO ) - RETURN - END IF - F = F*SCLFAC - C = C*SCLFAC - CA = CA*SCLFAC - R = R / SCLFAC - G = G / SCLFAC - RA = RA / SCLFAC - GO TO 160 -* - 170 CONTINUE - G = C / SCLFAC - 180 CONTINUE - IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR. - $ MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190 - F = F / SCLFAC - C = C / SCLFAC - G = G / SCLFAC - CA = CA / SCLFAC - R = R*SCLFAC - RA = RA*SCLFAC - GO TO 180 -* -* Now balance. -* - 190 CONTINUE - IF( ( C+R ).GE.FACTOR*S ) - $ GO TO 200 - IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN - IF( F*SCALE( I ).LE.SFMIN1 ) - $ GO TO 200 - END IF - IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN - IF( SCALE( I ).GE.SFMAX1 / F ) - $ GO TO 200 - END IF - G = ONE / F - SCALE( I ) = SCALE( I )*F - NOCONV = .TRUE. -* - CALL DSCAL( N-K+1, G, A( I, K ), LDA ) - CALL DSCAL( L, F, A( 1, I ), 1 ) -* - 200 CONTINUE -* - IF( NOCONV ) - $ GO TO 140 + IF( DISNAN( C+CA+R+RA ) ) THEN + INFO = -3 + CALL XERBLA( 'DGEBAL', -INFO ) + RETURN + END IF +* + G = R / SCLFAC + F = ONE + S = C + R +* + DO WHILE( C.LT.G .AND. MAX( F, C, CA ).LT.SFMAX2 .AND. + $ MIN( R, G, RA ).GT.SFMIN2 ) + F = F*SCLFAC + C = C*SCLFAC + CA = CA*SCLFAC + R = R / SCLFAC + G = G / SCLFAC + RA = RA / SCLFAC + END DO +* + G = C / SCLFAC +* + DO WHILE( G.GE.R .AND. MAX( R, RA ).LT.SFMAX2 .AND. + $ MIN( F, C, G, CA ).GT.SFMIN2 ) + F = F / SCLFAC + C = C / SCLFAC + G = G / SCLFAC + CA = CA / SCLFAC + R = R*SCLFAC + RA = RA*SCLFAC + END DO +* +* Now balance. +* + IF( ( C+R ).GE.FACTOR*S ) CYCLE + IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN + IF( F*SCALE( I ).LE.SFMIN1 ) CYCLE + END IF + IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN + IF( SCALE( I ).GE.SFMAX1 / F ) CYCLE + END IF + G = ONE / F + SCALE( I ) = SCALE( I )*F + NOCONV = .TRUE. +* + CALL DSCAL( N-K+1, G, A( I, K ), LDA ) + CALL DSCAL( L, F, A( 1, I ), 1 ) +* + END DO +* + END DO * - 210 CONTINUE ILO = K IHI = L * diff --git a/lapack-netlib/SRC/dgebrd.f b/lapack-netlib/SRC/dgebrd.f index 0f0d1651a..ac11d48a0 100644 --- a/lapack-netlib/SRC/dgebrd.f +++ b/lapack-netlib/SRC/dgebrd.f @@ -122,7 +122,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -147,7 +148,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gebrd * *> \par Further Details: * ===================== @@ -223,8 +224,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. EXTERNAL DGEBD2, DGEMM, DLABRD, XERBLA @@ -241,9 +242,17 @@ * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'DGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'DGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + ENDIF WORK( 1 ) = DBLE( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -251,7 +260,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -263,7 +272,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -282,7 +290,7 @@ * Determine when to switch from blocked to unblocked code. * IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using diff --git a/lapack-netlib/SRC/dgecon.f b/lapack-netlib/SRC/dgecon.f index 1ad302ae3..a543eb03a 100644 --- a/lapack-netlib/SRC/dgecon.f +++ b/lapack-netlib/SRC/dgecon.f @@ -105,8 +105,15 @@ *> \verbatim *> INFO is INTEGER *> = 0: successful exit -*> < 0: if INFO = -i, the i-th argument had an illegal value -*> =-5: if ANORM is NAN or negative. +*> < 0: if INFO = -i, the i-th argument had an illegal value. +*> NaNs are illegal values for ANORM, and they propagate to +*> the output parameter RCOND. +*> Infinity is illegal for ANORM, and it propagates to the output +*> parameter RCOND as 0. +*> = 1: if RCOND = NaN, or +*> RCOND = Inf, or +*> the computed norm of the inverse of A is 0. +*> In the latter, RCOND = 0 is returned. *> \endverbatim * * Authors: @@ -117,7 +124,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gecon * * ===================================================================== SUBROUTINE DGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, IWORK, @@ -147,7 +154,7 @@ LOGICAL ONENRM CHARACTER NORMIN INTEGER IX, KASE, KASE1 - DOUBLE PRECISION AINVNM, SCALE, SL, SMLNUM, SU + DOUBLE PRECISION AINVNM, SCALE, SL, SMLNUM, SU, HUGEVAL * .. * .. Local Arrays .. INTEGER ISAVE( 3 ) @@ -165,6 +172,8 @@ INTRINSIC ABS, MAX * .. * .. Executable Statements .. +* + HUGEVAL = DLAMCH( 'Overflow' ) * * Test the input parameters. * @@ -176,7 +185,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( ANORM.LT.ZERO .OR. DISNAN( ANORM ) ) THEN + ELSE IF( ANORM.LT.ZERO ) THEN INFO = -5 END IF IF( INFO.NE.0 ) THEN @@ -192,6 +201,13 @@ RETURN ELSE IF( ANORM.EQ.ZERO ) THEN RETURN + ELSE IF( DISNAN( ANORM ) ) THEN + RCOND = ANORM + INFO = -5 + RETURN + ELSE IF( ANORM.GT.HUGEVAL ) THEN + INFO = -5 + RETURN END IF * SMLNUM = DLAMCH( 'Safe minimum' ) @@ -248,8 +264,17 @@ * * Compute the estimate of the reciprocal condition number. * - IF( AINVNM.NE.ZERO ) - $ RCOND = ( ONE / AINVNM ) / ANORM + IF( AINVNM.NE.ZERO ) THEN + RCOND = ( ONE / AINVNM ) / ANORM + ELSE + INFO = 1 + RETURN + END IF +* +* Check for NaNs and Infs +* + IF( DISNAN( RCOND ) .OR. RCOND.GT.HUGEVAL ) + $ INFO = 1 * 20 CONTINUE RETURN diff --git a/lapack-netlib/SRC/dgedmd.c b/lapack-netlib/SRC/dgedmd.c new file mode 100644 index 000000000..66b4d5da6 --- /dev/null +++ b/lapack-netlib/SRC/dgedmd.c @@ -0,0 +1,1753 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i= 0 */ +/* The state space dimension (the row dimension of X, Y). */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshot pairs */ +/* (the number of columns of X and Y). */ +/* ..... */ +/* X (input/output) REAL(KIND=WP) M-by-N array */ +/* > On entry, X contains the data snapshot matrix X. It is */ +/* assumed that the column norms of X are in the range of */ +/* the normalized floating point numbers. */ +/* < On exit, the leading K columns of X contain a POD basis, */ +/* i.e. the leading K left singular vectors of the input */ +/* data matrix X, U(:,1:K). All N columns of X contain all */ +/* left singular vectors of the input matrix X. */ +/* See the descriptions of K, Z and W. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= M */ +/* The leading dimension of the array X. */ +/* ..... */ +/* Y (input/workspace/output) REAL(KIND=WP) M-by-N array */ +/* > On entry, Y contains the data snapshot matrix Y */ +/* < On exit, */ +/* If JOBR == 'R', the leading K columns of Y contain */ +/* the residual vectors for the computed Ritz pairs. */ +/* See the description of RES. */ +/* If JOBR == 'N', Y contains the original input data, */ +/* scaled according to the value of JOBS. */ +/* ..... */ +/* LDY (input) INTEGER , LDY >= M */ +/* The leading dimension of the array Y. */ +/* ..... */ +/* NRNK (input) INTEGER */ +/* Determines the mode how to compute the numerical rank, */ +/* i.e. how to truncate small singular values of the input */ +/* matrix X. On input, if */ +/* NRNK = -1 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(1). */ +/* This option is recommended. */ +/* NRNK = -2 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(i-1) */ +/* This option is included for R&D purposes. */ +/* It requires highly accurate SVD, which */ +/* may not be feasible. */ + +/* The numerical rank can be enforced by using positive */ +/* value of NRNK as follows: */ +/* 0 < NRNK <= N :: at most NRNK largest singular values */ +/* will be used. If the number of the computed nonzero */ +/* singular values is less than NRNK, then only those */ +/* nonzero values will be used and the actually used */ +/* dimension is less than NRNK. The actual number of */ +/* the nonzero singular values is returned in the variable */ +/* K. See the descriptions of TOL and K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the POD basis for the data snapshot */ +/* matrix X and the number of the computed Ritz pairs. */ +/* The value of K is determined according to the rule set */ +/* by the parameters NRNK and TOL. */ +/* See the descriptions of NRNK and TOL. */ +/* ..... */ +/* REIG (output) REAL(KIND=WP) N-by-1 array */ +/* The leading K (K<=N) entries of REIG contain */ +/* the real parts of the computed eigenvalues */ +/* REIG(1:K) + sqrt(-1)*IMEIG(1:K). */ +/* See the descriptions of K, IMEIG, and Z. */ +/* ..... */ +/* IMEIG (output) REAL(KIND=WP) N-by-1 array */ +/* The leading K (K<=N) entries of IMEIG contain */ +/* the imaginary parts of the computed eigenvalues */ +/* REIG(1:K) + sqrt(-1)*IMEIG(1:K). */ +/* The eigenvalues are determined as follows: */ +/* If IMEIG(i) == 0, then the corresponding eigenvalue is */ +/* real, LAMBDA(i) = REIG(i). */ +/* If IMEIG(i)>0, then the corresponding complex */ +/* conjugate pair of eigenvalues reads */ +/* LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) */ +/* LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) */ +/* That is, complex conjugate pairs have consecutive */ +/* indices (i,i+1), with the positive imaginary part */ +/* listed first. */ +/* See the descriptions of K, REIG, and Z. */ +/* ..... */ +/* Z (workspace/output) REAL(KIND=WP) M-by-N array */ +/* If JOBZ =='V' then */ +/* Z contains real Ritz vectors as follows: */ +/* If IMEIG(i)=0, then Z(:,i) is an eigenvector of */ +/* the i-th Ritz value; ||Z(:,i)||_2=1. */ +/* If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then */ +/* [Z(:,i) Z(:,i+1)] span an invariant subspace and */ +/* the Ritz values extracted from this subspace are */ +/* REIG(i) + sqrt(-1)*IMEIG(i) and */ +/* REIG(i) - sqrt(-1)*IMEIG(i). */ +/* The corresponding eigenvectors are */ +/* Z(:,i) + sqrt(-1)*Z(:,i+1) and */ +/* Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. */ +/* || Z(:,i:i+1)||_F = 1. */ +/* If JOBZ == 'F', then the above descriptions hold for */ +/* the columns of X(:,1:K)*W(1:K,1:K), where the columns */ +/* of W(1:k,1:K) are the computed eigenvectors of the */ +/* K-by-K Rayleigh quotient. The columns of W(1:K,1:K) */ +/* are similarly structured: If IMEIG(i) == 0 then */ +/* X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 */ +/* then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and */ +/* X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) */ +/* are the eigenvectors of LAMBDA(i), LAMBDA(i+1). */ +/* See the descriptions of REIG, IMEIG, X and W. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... */ +/* RES (output) REAL(KIND=WP) N-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs. */ +/* If LAMBDA(i) is real, then */ +/* RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. */ +/* If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair */ +/* then */ +/* RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F */ +/* where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] */ +/* [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. */ +/* It holds that */ +/* RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 */ +/* RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 */ +/* where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) */ +/* ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) */ +/* See the description of REIG, IMEIG and Z. */ +/* ..... */ +/* B (output) REAL(KIND=WP) M-by-N array. */ +/* IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:M,1;K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* If JOBF =='N', then B is not referenced. */ +/* See the descriptions of X, W, K. */ +/* ..... */ +/* LDB (input) INTEGER, LDB >= M */ +/* The leading dimension of the array B. */ +/* ..... */ +/* W (workspace/output) REAL(KIND=WP) N-by-N array */ +/* On exit, W(1:K,1:K) contains the K computed */ +/* eigenvectors of the matrix Rayleigh quotient (real and */ +/* imaginary parts for each complex conjugate pair of the */ +/* eigenvalues). The Ritz vectors (returned in Z) are the */ +/* product of X (containing a POD basis for the input */ +/* matrix X) and W. See the descriptions of K, S, X and Z. */ +/* W is also used as a workspace to temporarily store the */ +/* right singular vectors of X. */ +/* ..... */ +/* LDW (input) INTEGER, LDW >= N */ +/* The leading dimension of the array W. */ +/* ..... */ +/* S (workspace/output) REAL(KIND=WP) N-by-N array */ +/* The array S(1:K,1:K) is used for the matrix Rayleigh */ +/* quotient. This content is overwritten during */ +/* the eigenvalue decomposition by DGEEV. */ +/* See the description of K. */ +/* ..... */ +/* LDS (input) INTEGER, LDS >= N */ +/* The leading dimension of the array S. */ +/* ..... */ +/* WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */ +/* On exit, WORK(1:N) contains the singular values of */ +/* X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). */ +/* If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain */ +/* scaling factor WORK(N+2)/WORK(N+1) used to scale X */ +/* and Y to avoid overflow in the SVD of X. */ +/* This may be of interest if the scaling option is off */ +/* and as many as possible smallest eigenvalues are */ +/* desired to the highest feasible accuracy. */ +/* If the call to DGEDMD is only workspace query, then */ +/* WORK(1) contains the minimal workspace length and */ +/* WORK(2) is the optimal workspace length. Hence, the */ +/* leng of work is at least 2. */ +/* See the description of LWORK. */ +/* ..... */ +/* LWORK (input) INTEGER */ +/* The minimal length of the workspace vector WORK. */ +/* LWORK is calculated as follows: */ +/* If WHTSVD == 1 :: */ +/* If JOBZ == 'V', then */ +/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). */ +/* If JOBZ == 'N' then */ +/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). */ +/* Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal */ +/* workspace length of DGESVD. */ +/* If WHTSVD == 2 :: */ +/* If JOBZ == 'V', then */ +/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) */ +/* If JOBZ == 'N', then */ +/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) */ +/* Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the */ +/* minimal workspace length of DGESDD. */ +/* If WHTSVD == 3 :: */ +/* If JOBZ == 'V', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) */ +/* If JOBZ == 'N', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) */ +/* Here LWORK_SVD = N+M+MAX(3*N+1, */ +/* MAX(1,3*N+M,5*N),MAX(1,N)) */ +/* is the minimal workspace length of DGESVDQ. */ +/* If WHTSVD == 4 :: */ +/* If JOBZ == 'V', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) */ +/* If JOBZ == 'N', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) */ +/* Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the */ +/* minimal workspace length of DGEJSV. */ +/* The above expressions are not simplified in order to */ +/* make the usage of WORK more transparent, and for */ +/* easier checking. In any case, LWORK >= 2. */ +/* If on entry LWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */ +/* Workspace that is required only if WHTSVD equals */ +/* 2 , 3 or 4. (See the description of WHTSVD). */ +/* If on entry LWORK =-1 or LIWORK=-1, then the */ +/* minimal length of IWORK is computed and returned in */ +/* IWORK(1). See the description of LIWORK. */ +/* ..... */ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */ +/* If on entry LIWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). */ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ +/* Local arrays */ +/* ~~~~~~~~~~~~ */ +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ +/* ............................................................ */ + /* Parameter adjustments */ + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --reig; + --imeig; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + w_dim1 = *ldw; + w_offset = 1 + w_dim1 * 1; + w -= w_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --work; + --iwork; + + /* Function Body */ + one = 1.f; + zero = 0.f; + +/* Test the input arguments */ + + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + *info = 0; + lquery = *lwork == -1 || *liwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || lsame_(jobz, "N") || lsame_( + jobz, "F"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && ! wntvec) { + *info = -3; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -4; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -5; + } else if (*m < 0) { + *info = -6; + } else if (*n < 0 || *n > *m) { + *info = -7; + } else if (*ldx < *m) { + *info = -9; + } else if (*ldy < *m) { + *info = -11; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -12; + } else if (*tol < zero || *tol >= one) { + *info = -13; + } else if (*ldz < *m) { + *info = -18; + } else if ((wntref || wntex) && *ldb < *m) { + *info = -21; + } else if (*ldw < *n) { + *info = -23; + } else if (*lds < *n) { + *info = -25; + } + + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0) { +/* Quick return. All output except K is void. */ +/* INFO=1 signals the void input. */ +/* In case of a workspace query, the default */ +/* minimal workspace lengths are returned. */ + if (lquery) { + iwork[1] = 1; + work[1] = 2.; + work[2] = 2.; + } else { + *k = 0; + } + *info = 1; + return 0; + } + mlwork = f2cmax(2,*n); + olwork = f2cmax(2,*n); + iminwr = 1; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of DGESVD: */ +/* MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) */ +/* Computing MAX */ + i__1 = 1, i__2 = f2cmin(*m,*n) * 3 + f2cmax(*m,*n), i__1 = f2cmax(i__1, + i__2), i__2 = f2cmin(*m,*n) * 5; + mwrsvd = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsvd; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + dgesvd_("O", "S", m, n, &x[x_offset], ldx, &work[1], &b[ + b_offset], ldb, &w[w_offset], ldw, rdummy, &c_n1, & + info1); +/* Computing MAX */ + i__1 = mwrsvd, i__2 = (integer) rdummy[0]; + lwrsvd = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrsvd; + olwork = f2cmax(i__1,i__2); + } + } else if (*whtsvd == 2) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of DGESDD: */ +/* MWRSDD = 3*MIN(M,N)*MIN(M,N) + */ +/* MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) */ +/* IMINWR = 8*MIN(M,N) */ +/* Computing MAX */ + i__1 = f2cmax(*m,*n), i__2 = f2cmin(*m,*n) * 5 * f2cmin(*m,*n) + (f2cmin(*m,* + n) << 2); + mwrsdd = f2cmin(*m,*n) * 3 * f2cmin(*m,*n) + f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsdd; + mlwork = f2cmax(i__1,i__2); + iminwr = f2cmin(*m,*n) << 3; + if (lquery) { + dgesdd_("O", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], + ldb, &w[w_offset], ldw, rdummy, &c_n1, &iwork[1], & + info1); +/* Computing MAX */ + i__1 = mwrsdd, i__2 = (integer) rdummy[0]; + lwrsdd = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrsdd; + olwork = f2cmax(i__1,i__2); + } + } else if (*whtsvd == 3) { +/* LWQP3 = 3*N+1 */ +/* LWORQ = MAX(N, 1) */ +/* MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) */ +/* MWRSVQ = N + MAX( LWQP3, MWRSVD, LWORQ ) + MAX(M,2) */ +/* MLWORK = N + MWRSVQ */ +/* IMINWR = M+N-1 */ + dgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &work[ + 1], &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, & + iwork[1], liwork, rdummy, &c_n1, rdummy2, &c_n1, &info1); + iminwr = iwork[1]; + mwrsvq = (integer) rdummy[1]; +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsvq + (integer) rdummy2[0]; + mlwork = f2cmax(i__1,i__2); + if (lquery) { +/* Computing MAX */ + i__1 = mwrsvq, i__2 = (integer) rdummy[0]; + lwrsvq = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrsvq + (integer) rdummy2[0]; + olwork = f2cmax(i__1,i__2); + } + } else if (*whtsvd == 4) { + *(unsigned char *)jsvopt = 'J'; +/* MWRSVJ = MAX( 7, 2*M+N, 6*N+2*N*N ) ! for JSVOPT='V' */ +/* Computing MAX */ + i__1 = 7, i__2 = (*m << 1) + *n, i__1 = f2cmax(i__1,i__2), i__2 = (* + n << 2) + *n * *n, i__1 = f2cmax(i__1,i__2), i__2 = (*n << 1) + + *n * *n + 6; + mwrsvj = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsvj; + mlwork = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 3, i__2 = *m + *n * 3; + iminwr = f2cmax(i__1,i__2); + if (lquery) { +/* Computing MAX */ + i__1 = olwork, i__2 = *n + mwrsvj; + olwork = f2cmax(i__1,i__2); + } +/* END SELECT */ + } + if (wntvec || wntex || lsame_(jobz, "F")) { + *(unsigned char *)jobzl = 'V'; + } else { + *(unsigned char *)jobzl = 'N'; + } +/* Workspace calculation to the DGEEV call */ + if (lsame_(jobzl, "V")) { +/* Computing MAX */ + i__1 = 1, i__2 = *n << 2; + mwrkev = f2cmax(i__1,i__2); + } else { +/* Computing MAX */ + i__1 = 1, i__2 = *n * 3; + mwrkev = f2cmax(i__1,i__2); + } +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrkev; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + dgeev_("N", jobzl, n, &s[s_offset], lds, &reig[1], &imeig[1], &w[ + w_offset], ldw, &w[w_offset], ldw, rdummy, &c_n1, &info1); +/* Computing MAX */ + i__1 = mwrkev, i__2 = (integer) rdummy[0]; + lwrkev = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrkev; + olwork = f2cmax(i__1,i__2); + } + + if (*liwork < iminwr && ! lquery) { + *info = -29; + } + if (*lwork < mlwork && ! lquery) { + *info = -27; + } + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("DGEDMD", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + work[1] = (doublereal) mlwork; + work[2] = (doublereal) olwork; + return 0; + } +/* ............................................................ */ + + ofl = dlamch_("O"); + small = dlamch_("S"); + badxy = FALSE_; + +/* <1> Optional scaling of the snapshots (columns of X, Y) */ +/* ========================================================== */ + if (sccolx) { +/* The columns of X will be normalized. */ +/* To prevent overflows, the column norms of X are */ +/* carefully computed using DLASSQ. */ + *k = 0; + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* WORK(i) = DNRM2( M, X(1,i), 1 ) */ + scale = zero; + dlassq_(m, &x[i__ * x_dim1 + 1], &c__1, &scale, &ssum); + if (disnan_(&scale) || disnan_(&ssum)) { + *k = 0; + *info = -8; + i__2 = -(*info); + xerbla_("DGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of X(:,i) overflows. First, X(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. */ +/* Next, the norm of X(:,i) is stored without */ +/* overflow as WORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of X(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + d__1 = one / rootsc; + dlascl_("G", &c__0, &c__0, &scale, &d__1, m, &c__1, &x[ + i__ * x_dim1 + 1], m, &info2); + work[i__] = -scale * (rootsc / (doublereal) (*m)); + } else { +/* X(:,i) will be scaled to unit 2-norm */ + work[i__] = scale * rootsc; + dlascl_("G", &c__0, &c__0, &work[i__], &one, m, &c__1, &x[ + i__ * x_dim1 + 1], m, &info2); +/* X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC */ +/* LAPACK */ + } + } else { + work[i__] = zero; + ++(*k); + } + } + if (*k == *n) { +/* All columns of X are zero. Return error code -8. */ +/* (the 8th input variable had an illegal value) */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("DGEDMD", &i__1); + return 0; + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of Y. */ + if (work[i__] > zero) { + d__1 = one / work[i__]; + dscal_(m, &d__1, &y[i__ * y_dim1 + 1], &c__1); +/* Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (work[i__] < zero) { + d__1 = -work[i__]; + d__2 = one / (doublereal) (*m); + dlascl_("G", &c__0, &c__0, &d__1, &d__2, m, &c__1, &y[i__ * + y_dim1 + 1], m, &info2); +/* LAPACK CAL */ + } else if (y[idamax_(m, &y[i__ * y_dim1 + 1], &c__1) + i__ * + y_dim1] != zero) { +/* X(:,i) is zero vector. For consistency, */ +/* Y(:,i) should also be zero. If Y(:,i) is not */ +/* zero, then the data might be inconsistent or */ +/* corrupted. If JOBS == 'C', Y(:,i) is set to */ +/* zero and a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + if (lsame_(jobs, "C")) { + dscal_(m, &zero, &y[i__ * y_dim1 + 1], &c__1); + } +/* BLAS CALL */ + } + } + } + + if (sccoly) { +/* The columns of Y will be normalized. */ +/* To prevent overflows, the column norms of Y are */ +/* carefully computed using DLASSQ. */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* WORK(i) = DNRM2( M, Y(1,i), 1 ) */ + scale = zero; + dlassq_(m, &y[i__ * y_dim1 + 1], &c__1, &scale, &ssum); + if (disnan_(&scale) || disnan_(&ssum)) { + *k = 0; + *info = -10; + i__2 = -(*info); + xerbla_("DGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of Y(:,i) overflows. First, Y(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. */ +/* Next, the norm of Y(:,i) is stored without */ +/* overflow as WORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of Y(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + d__1 = one / rootsc; + dlascl_("G", &c__0, &c__0, &scale, &d__1, m, &c__1, &y[ + i__ * y_dim1 + 1], m, &info2); + work[i__] = -scale * (rootsc / (doublereal) (*m)); + } else { +/* X(:,i) will be scaled to unit 2-norm */ + work[i__] = scale * rootsc; + dlascl_("G", &c__0, &c__0, &work[i__], &one, m, &c__1, &y[ + i__ * y_dim1 + 1], m, &info2); +/* Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* LAPACK */ + } + } else { + work[i__] = zero; + } + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of X. */ + if (work[i__] > zero) { + d__1 = one / work[i__]; + dscal_(m, &d__1, &x[i__ * x_dim1 + 1], &c__1); +/* X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (work[i__] < zero) { + d__1 = -work[i__]; + d__2 = one / (doublereal) (*m); + dlascl_("G", &c__0, &c__0, &d__1, &d__2, m, &c__1, &x[i__ * + x_dim1 + 1], m, &info2); +/* LAPACK CAL */ + } else if (x[idamax_(m, &x[i__ * x_dim1 + 1], &c__1) + i__ * + x_dim1] != zero) { +/* Y(:,i) is zero vector. If X(:,i) is not */ +/* zero, then a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + } + } + } + +/* <2> SVD of the data snapshot matrix X. */ +/* ===================================== */ +/* The left singular vectors are stored in the array X. */ +/* The right singular vectors are in the array W. */ +/* The array W will later on contain the eigenvectors */ +/* of a Rayleigh quotient. */ + numrnk = *n; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { + i__1 = *lwork - *n; + dgesvd_("O", "S", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], + ldb, &w[w_offset], ldw, &work[*n + 1], &i__1, &info1); +/* LAPACK CAL */ + *(unsigned char *)t_or_n__ = 'T'; + } else if (*whtsvd == 2) { + i__1 = *lwork - *n; + dgesdd_("O", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], ldb, &w[ + w_offset], ldw, &work[*n + 1], &i__1, &iwork[1], &info1); +/* LAPACK CAL */ + *(unsigned char *)t_or_n__ = 'T'; + } else if (*whtsvd == 3) { + i__1 = *lwork - *n - f2cmax(2,*m); + i__2 = f2cmax(2,*m); + dgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &work[1], & + z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &iwork[1], + liwork, &work[*n + f2cmax(2,*m) + 1], &i__1, &work[*n + 1], & + i__2, &info1); +/* L */ + dlacpy_("A", m, &numrnk, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK C */ + *(unsigned char *)t_or_n__ = 'T'; + } else if (*whtsvd == 4) { + i__1 = *lwork - *n; + dgejsv_("F", "U", jsvopt, "N", "N", "P", m, n, &x[x_offset], ldx, & + work[1], &z__[z_offset], ldz, &w[w_offset], ldw, &work[*n + 1] + , &i__1, &iwork[1], &info1); +/* LAPACK CALL */ + dlacpy_("A", m, n, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK CALL */ + *(unsigned char *)t_or_n__ = 'N'; + xscl1 = work[*n + 1]; + xscl2 = work[*n + 2]; + if (xscl1 != xscl2) { +/* This is an exceptional situation. If the */ +/* data matrices are not scaled and the */ +/* largest singular value of X overflows. */ +/* In that case DGEJSV can return the SVD */ +/* in scaled form. The scaling factor can be used */ +/* to rescale the data (X and Y). */ + dlascl_("G", &c__0, &c__0, &xscl1, &xscl2, m, n, &y[y_offset], + ldy, &info2); + } +/* END SELECT */ + } + + if (info1 > 0) { +/* The SVD selected subroutine did not converge. */ +/* Return with an error code. */ + *info = 2; + return 0; + } + + if (work[1] == zero) { +/* The largest computed singular value of (scaled) */ +/* X is zero. Return error code -8 */ +/* (the 8th input variable had an illegal value). */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("DGEDMD", &i__1); + return 0; + } + +/* <3> Determine the numerical rank of the data */ +/* snapshots matrix X. This depends on the */ +/* parameters NRNK and TOL. */ +/* SELECT CASE ( NRNK ) */ + if (*nrnk == -1) { + *k = 1; + i__1 = numrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (work[i__] <= work[1] * *tol || work[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else if (*nrnk == -2) { + *k = 1; + i__1 = numrnk - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + if (work[i__ + 1] <= work[i__] * *tol || work[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else { + *k = 1; + i__1 = *nrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (work[i__] <= small) { + myexit_(); + } + ++(*k); + } +/* END SELECT */ + } +/* Now, U = X(1:M,1:K) is the SVD/POD basis for the */ +/* snapshot data in the input matrix X. */ +/* <4> Compute the Rayleigh quotient S = U^T * A * U. */ +/* Depending on the requested outputs, the computation */ +/* is organized to compute additional auxiliary */ +/* matrices (for the residuals and refinements). */ + +/* In all formulas below, we need V_k*Sigma_k^(-1) */ +/* where either V_k is in W(1:N,1:K), or V_k^T is in */ +/* W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). */ + if (lsame_(t_or_n__, "N")) { + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + d__1 = one / work[i__]; + dscal_(n, &d__1, &w[i__ * w_dim1 + 1], &c__1); +/* W(1:N,i) = (ONE/WORK(i)) * W(1:N,i) ! INTRINSIC */ +/* BLAS CALL */ + } + } else { +/* This non-unit stride access is due to the fact */ +/* that DGESVD, DGESVDQ and DGESDD return the */ +/* transposed matrix of the right singular vectors. */ +/* DO i = 1, K */ +/* CALL DSCAL( N, ONE/WORK(i), W(i,1), LDW ) ! BLAS CALL */ +/* ! W(i,1:N) = (ONE/WORK(i)) * W(i,1:N) ! INTRINSIC */ +/* END DO */ + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + work[*n + i__] = one / work[i__]; + } + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *k; + for (i__ = 1; i__ <= i__2; ++i__) { + w[i__ + j * w_dim1] = work[*n + i__] * w[i__ + j * w_dim1]; + } + } + } + + if (wntref) { + +/* Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) */ +/* for computing the refined Ritz vectors */ +/* (optionally, outside DGEDMD). */ + dgemm_("N", t_or_n__, m, k, n, &one, &y[y_offset], ldy, &w[w_offset], + ldw, &zero, &z__[z_offset], ldz); +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRI */ +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K)) ! INTRI */ + +/* At this point Z contains */ +/* A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and */ +/* this is needed for computing the residuals. */ +/* This matrix is returned in the array B and */ +/* it can be used to compute refined Ritz vectors. */ +/* BLAS */ + dlacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); +/* B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC */ +/* BLAS CALL */ + dgemm_("T", "N", k, k, m, &one, &x[x_offset], ldx, &z__[z_offset], + ldz, &zero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRI */ +/* At this point S = U^T * A * U is the Rayleigh quotient. */ +/* BLAS */ + } else { +/* A * U(:,1:K) is not explicitly needed and the */ +/* computation is organized differently. The Rayleigh */ +/* quotient is computed more efficiently. */ + dgemm_("T", "N", k, n, m, &one, &x[x_offset], ldx, &y[y_offset], ldy, + &zero, &z__[z_offset], ldz); +/* Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) ) ! IN */ +/* In the two DGEMM calls here, can use K for LDZ. */ +/* B */ + dgemm_("N", t_or_n__, k, k, n, &one, &z__[z_offset], ldz, &w[w_offset] + , ldw, &zero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRIN */ +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K))) ! INTRIN */ +/* At this point S = U^T * A * U is the Rayleigh quotient. */ +/* If the residuals are requested, save scaled V_k into Z. */ +/* Recall that V_k or V_k^T is stored in W. */ +/* BLAS */ + if (wntres || wntex) { + if (lsame_(t_or_n__, "N")) { + dlacpy_("A", n, k, &w[w_offset], ldw, &z__[z_offset], ldz); + } else { + dlacpy_("A", k, n, &w[w_offset], ldw, &z__[z_offset], ldz); + } + } + } + +/* <5> Compute the Ritz values and (if requested) the */ +/* right eigenvectors of the Rayleigh quotient. */ + + i__1 = *lwork - *n; + dgeev_("N", jobzl, k, &s[s_offset], lds, &reig[1], &imeig[1], &w[w_offset] + , ldw, &w[w_offset], ldw, &work[*n + 1], &i__1, &info1); + +/* W(1:K,1:K) contains the eigenvectors of the Rayleigh */ +/* quotient. Even in the case of complex spectrum, all */ +/* computation is done in real arithmetic. REIG and */ +/* IMEIG are the real and the imaginary parts of the */ +/* eigenvalues, so that the spectrum is given as */ +/* REIG(:) + sqrt(-1)*IMEIG(:). Complex conjugate pairs */ +/* are listed at consecutive positions. For such a */ +/* complex conjugate pair of the eigenvalues, the */ +/* corresponding eigenvectors are also a complex */ +/* conjugate pair with the real and imaginary parts */ +/* stored column-wise in W at the corresponding */ +/* consecutive column indices. See the description of Z. */ +/* Also, see the description of DGEEV. */ +/* LAPACK C */ + if (info1 > 0) { +/* DGEEV failed to compute the eigenvalues and */ +/* eigenvectors of the Rayleigh quotient. */ + *info = 3; + return 0; + } + +/* <6> Compute the eigenvectors (if requested) and, */ +/* the residuals (if requested). */ + + if (wntvec || wntex) { + if (wntres) { + if (wntref) { +/* Here, if the refinement is requested, we have */ +/* A*U(:,1:K) already computed and stored in Z. */ +/* For the residuals, need Y = A * U(:,1;K) * W. */ + dgemm_("N", "N", m, k, k, &one, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zero, &y[y_offset], ldy); +/* Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC */ +/* This frees Z; Y contains A * U(:,1:K) * W. */ +/* BLAS CALL */ + } else { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) is stored in Z */ + dgemm_(t_or_n__, "N", n, k, k, &one, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + dgemm_("N", "N", m, k, n, &one, &y[y_offset], ldy, &s[ + s_offset], lds, &zero, &z__[z_offset], ldz); +/* Save a copy of Z into Y and free Z for holding */ +/* the Ritz vectors. */ + dlacpy_("A", m, k, &z__[z_offset], ldz, &y[y_offset], ldy); + if (wntex) { + dlacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); + } + } + } else if (wntex) { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) is stored in Z */ + dgemm_(t_or_n__, "N", n, k, k, &one, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + dgemm_("N", "N", m, k, n, &one, &y[y_offset], ldy, &s[s_offset], + lds, &zero, &b[b_offset], ldb); +/* The above call replaces the following two calls */ +/* that were used in the developing-testing phase. */ +/* CALL DGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & */ +/* LDS, ZERO, Z, LDZ) */ +/* Save a copy of Z into B and free Z for holding */ +/* the Ritz vectors. */ +/* CALL DLACPY( 'A', M, K, Z, LDZ, B, LDB ) */ + } + +/* Compute the real form of the Ritz vectors */ + if (wntvec) { + dgemm_("N", "N", m, k, k, &one, &x[x_offset], ldx, &w[w_offset], + ldw, &zero, &z__[z_offset], ldz); + } +/* Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC */ + +/* BLAS CALL */ + if (wntres) { + i__ = 1; + while(i__ <= *k) { + if (imeig[i__] == zero) { +/* have a real eigenvalue with real eigenvector */ + d__1 = -reig[i__]; + daxpy_(m, &d__1, &z__[i__ * z_dim1 + 1], &c__1, &y[i__ * + y_dim1 + 1], &c__1); +/* Y(1:M,i) = Y(1:M,i) - REIG(i) * Z(1:M,i) ! */ + + res[i__] = dnrm2_(m, &y[i__ * y_dim1 + 1], &c__1); + + ++i__; + } else { +/* Have a complex conjugate pair */ +/* REIG(i) +- sqrt(-1)*IMEIG(i). */ +/* Since all computation is done in real */ +/* arithmetic, the formula for the residual */ +/* is recast for real representation of the */ +/* complex conjugate eigenpair. See the */ +/* description of RES. */ + ab[0] = reig[i__]; + ab[1] = -imeig[i__]; + ab[2] = imeig[i__]; + ab[3] = reig[i__]; + d__1 = -one; + dgemm_("N", "N", m, &c__2, &c__2, &d__1, &z__[i__ * + z_dim1 + 1], ldz, ab, &c__2, &one, &y[i__ * + y_dim1 + 1], ldy); +/* Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB ! INT */ +/* BL */ + res[i__] = dlange_("F", m, &c__2, &y[i__ * y_dim1 + 1], + ldy, &work[*n + 1]); +/* LA */ + res[i__ + 1] = res[i__]; + i__ += 2; + } + } + } + } + + if (*whtsvd == 4) { + work[*n + 1] = xscl1; + work[*n + 2] = xscl2; + } + +/* Successful exit. */ + if (! badxy) { + *info = 0; + } else { +/* A warning on possible data inconsistency. */ +/* This should be a rare event. */ + *info = 4; + } +/* ............................................................ */ + return 0; +/* ...... */ +} /* dgedmd_ */ + diff --git a/lapack-netlib/SRC/dgedmd.f90 b/lapack-netlib/SRC/dgedmd.f90 new file mode 100644 index 000000000..15df48fe9 --- /dev/null +++ b/lapack-netlib/SRC/dgedmd.f90 @@ -0,0 +1,1206 @@ +!> \brief \b DGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== +! +! SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, REIG, IMEIG, Z, LDZ, RES, & +! B, LDB, W, LDW, S, LDS, & +! WORK, LWORK, IWORK, LIWORK, INFO ) +! +!..... +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real64 +!..... +! Scalar arguments +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LWORK, LIWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments +! REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & +! RES(*) +! REAL(KIND=WP), INTENT(OUT) :: WORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) +! +!............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> DGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, DGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, DGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim +!...................................................................... +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. +!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim +!...................................................................... +!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim +!...................................................................... +! Arguments +! ========= +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) is CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim +!..... +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. +!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim +!..... +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. +!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim +!..... +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim +!..... +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library. +!> 1 :: DGESVD (the QR SVD algorithm) +!> 2 :: DGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: DGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim +!..... +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim +!..... +!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim +!..... +!> \param[in,out] X +!> \verbatim +!> X (input/output) REAL(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. +!> \endverbatim +!..... +!> \param[in] LDX +!> \verbatim +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim +!..... +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) REAL(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim +!..... +!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim +!..... +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1). +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim +!..... +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim +!..... +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim +!..... +!> \param[out] REIG +!> \verbatim +!> REIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of REIG contain +!> the real parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> See the descriptions of K, IMEIG, and Z. +!> \endverbatim +!..... +!> \param[out] IMEIG +!> \verbatim +!> IMEIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of IMEIG contain +!> the imaginary parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> The eigenvalues are determined as follows: +!> If IMEIG(i) == 0, then the corresponding eigenvalue is +!> real, LAMBDA(i) = REIG(i). +!> If IMEIG(i)>0, then the corresponding complex +!> conjugate pair of eigenvalues reads +!> LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) +!> LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) +!> That is, complex conjugate pairs have consecutive +!> indices (i,i+1), with the positive imaginary part +!> listed first. +!> See the descriptions of K, REIG, and Z. +!> \endverbatim +!..... +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) REAL(KIND=WP) M-by-N array +!> If JOBZ =='V' then +!> Z contains real Ritz vectors as follows: +!> If IMEIG(i)=0, then Z(:,i) is an eigenvector of +!> the i-th Ritz value; ||Z(:,i)||_2=1. +!> If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then +!> [Z(:,i) Z(:,i+1)] span an invariant subspace and +!> the Ritz values extracted from this subspace are +!> REIG(i) + sqrt(-1)*IMEIG(i) and +!> REIG(i) - sqrt(-1)*IMEIG(i). +!> The corresponding eigenvectors are +!> Z(:,i) + sqrt(-1)*Z(:,i+1) and +!> Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. +!> || Z(:,i:i+1)||_F = 1. +!> If JOBZ == 'F', then the above descriptions hold for +!> the columns of X(:,1:K)*W(1:K,1:K), where the columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. The columns of W(1:K,1:K) +!> are similarly structured: If IMEIG(i) == 0 then +!> X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 +!> then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and +!> X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) +!> are the eigenvectors of LAMBDA(i), LAMBDA(i+1). +!> See the descriptions of REIG, IMEIG, X and W. +!> \endverbatim +!..... +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim +!..... +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs. +!> If LAMBDA(i) is real, then +!> RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. +!> If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair +!> then +!> RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F +!> where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] +!> [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. +!> It holds that +!> RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 +!> RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 +!> where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) +!> ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) +!> See the description of REIG, IMEIG and Z. +!> \endverbatim +!..... +!> \param[out] B +!> \verbatim +!> B (output) REAL(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1;K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim +!..... +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim +!..... +!> \param[out] W +!> \verbatim +!> W (workspace/output) REAL(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient (real and +!> imaginary parts for each complex conjugate pair of the +!> eigenvalues). The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> right singular vectors of X. +!> \endverbatim +!..... +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim +!..... +!> \param[out] S +!> \verbatim +!> S (workspace/output) REAL(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by DGEEV. +!> See the description of K. +!> \endverbatim +!..... +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim +!..... +!> \param[out] WORK +!> \verbatim +!> WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +!> On exit, WORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +!> If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain +!> scaling factor WORK(N+2)/WORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. +!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. +!> If the call to DGEDMD is only workspace query, then +!> WORK(1) contains the minimal workspace length and +!> WORK(2) is the optimal workspace length. Hence, the +!> leng of work is at least 2. +!> See the description of LWORK. +!> \endverbatim +!..... +!> \param[in] LWORK +!> \verbatim +!> LWORK (input) INTEGER +!> The minimal length of the workspace vector WORK. +!> LWORK is calculated as follows: +!> If WHTSVD == 1 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). +!> If JOBZ == 'N' then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). +!> Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal +!> workspace length of DGESVD. +!> If WHTSVD == 2 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the +!> minimal workspace length of DGESDD. +!> If WHTSVD == 3 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = N+M+MAX(3*N+1, +!> MAX(1,3*N+M,5*N),MAX(1,N)) +!> is the minimal workspace length of DGESVDQ. +!> If WHTSVD == 4 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the +!> minimal workspace length of DGEJSV. +!> The above expressions are not simplified in order to +!> make the usage of WORK more transparent, and for +!> easier checking. In any case, LWORK >= 2. +!> If on entry LWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim +!..... +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim +!..... +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim +!..... +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! +!............................................................. +!............................................................. + SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, REIG, IMEIG, Z, LDZ, RES, & + B, LDB, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 +! +! Scalar arguments +! ~~~~~~~~~~~~~~~~ + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LWORK, LIWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! +! Array arguments +! ~~~~~~~~~~~~~~~ + REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & + RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +! +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP +! +! Local scalars +! ~~~~~~~~~~~~~ + REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & + SSUM, XSCL1, XSCL2 + INTEGER :: i, j, IMINWR, INFO1, INFO2, & + LWRKEV, LWRSDD, LWRSVD, & + LWRSVQ, MLWORK, MWRKEV, MWRSDD, & + MWRSVD, MWRSVJ, MWRSVQ, NUMRNK, & + OLWORK + LOGICAL :: BADXY, LQUERY, SCCOLX, SCCOLY, & + WNTEX, WNTREF, WNTRES, WNTVEC + CHARACTER :: JOBZL, T_OR_N + CHARACTER :: JSVOPT +! +! Local arrays +! ~~~~~~~~~~~~ + REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2) +! +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + REAL(KIND=WP) DLANGE, DLAMCH, DNRM2 + EXTERNAL DLANGE, DLAMCH, DNRM2, IDAMAX + INTEGER IDAMAX + LOGICAL DISNAN, LSAME + EXTERNAL DISNAN, LSAME +! +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL DAXPY, DGEMM, DSCAL + EXTERNAL DGEEV, DGEJSV, DGESDD, DGESVD, DGESVDQ, & + DLACPY, DLASCL, DLASSQ, XERBLA +! +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC DBLE, INT, MAX, SQRT +!............................................................ +! +! Test the input arguments +! + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME(JOBS,'C') + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + INFO = 0 + LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. & + LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. LSAME(JOBZ,'N') & + .OR. LSAME(JOBZ,'F')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. (.NOT.WNTVEC) ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -4 + ELSE IF ( .NOT.((WHTSVD == 1) .OR. (WHTSVD == 2) .OR. & + (WHTSVD == 3) .OR. (WHTSVD == 4) )) THEN + INFO = -5 + ELSE IF ( M < 0 ) THEN + INFO = -6 + ELSE IF ( ( N < 0 ) .OR. ( N > M ) ) THEN + INFO = -7 + ELSE IF ( LDX < M ) THEN + INFO = -9 + ELSE IF ( LDY < M ) THEN + INFO = -11 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -12 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -13 + ELSE IF ( LDZ < M ) THEN + INFO = -18 + ELSE IF ( (WNTREF .OR. WNTEX ) .AND. ( LDB < M ) ) THEN + INFO = -21 + ELSE IF ( LDW < N ) THEN + INFO = -23 + ELSE IF ( LDS < N ) THEN + INFO = -25 + END IF +! + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( N == 0 ) THEN + ! Quick return. All output except K is void. + ! INFO=1 signals the void input. + ! In case of a workspace query, the default + ! minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + WORK(1) = 2 + WORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + MLWORK = MAX(2,N) + OLWORK = MAX(2,N) + IMINWR = 1 + SELECT CASE ( WHTSVD ) + CASE (1) + ! The following is specified as the minimal + ! length of WORK in the definition of DGESVD: + ! MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) + MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) + MLWORK = MAX(MLWORK,N + MWRSVD) + IF ( LQUERY ) THEN + CALL DGESVD( 'O', 'S', M, N, X, LDX, WORK, & + B, LDB, W, LDW, RDUMMY, -1, INFO1 ) + LWRSVD = MAX( MWRSVD, INT( RDUMMY(1) ) ) + OLWORK = MAX(OLWORK,N + LWRSVD) + END IF + CASE (2) + ! The following is specified as the minimal + ! length of WORK in the definition of DGESDD: + ! MWRSDD = 3*MIN(M,N)*MIN(M,N) + + ! MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) + ! IMINWR = 8*MIN(M,N) + MWRSDD = 3*MIN(M,N)*MIN(M,N) + & + MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) + MLWORK = MAX(MLWORK,N + MWRSDD) + IMINWR = 8*MIN(M,N) + IF ( LQUERY ) THEN + CALL DGESDD( 'O', M, N, X, LDX, WORK, B, & + LDB, W, LDW, RDUMMY, -1, IWORK, INFO1 ) + LWRSDD = MAX( MWRSDD, INT( RDUMMY(1) ) ) + OLWORK = MAX(OLWORK,N + LWRSDD) + END IF + CASE (3) + !LWQP3 = 3*N+1 + !LWORQ = MAX(N, 1) + !MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) + !MWRSVQ = N + MAX( LWQP3, MWRSVD, LWORQ ) + MAX(M,2) + !MLWORK = N + MWRSVQ + !IMINWR = M+N-1 + CALL DGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, WORK, Z, LDZ, W, LDW, & + NUMRNK, IWORK, LIWORK, RDUMMY, & + -1, RDUMMY2, -1, INFO1 ) + IMINWR = IWORK(1) + MWRSVQ = INT(RDUMMY(2)) + MLWORK = MAX(MLWORK,N+MWRSVQ+INT(RDUMMY2(1))) + IF ( LQUERY ) THEN + LWRSVQ = MAX( MWRSVQ, INT(RDUMMY(1)) ) + OLWORK = MAX(OLWORK,N+LWRSVQ+INT(RDUMMY2(1))) + END IF + CASE (4) + JSVOPT = 'J' + !MWRSVJ = MAX( 7, 2*M+N, 6*N+2*N*N ) ! for JSVOPT='V' + MWRSVJ = MAX( 7, 2*M+N, 4*N+N*N, 2*N+N*N+6 ) + MLWORK = MAX(MLWORK,N+MWRSVJ) + IMINWR = MAX( 3, M+3*N ) + IF ( LQUERY ) THEN + OLWORK = MAX(OLWORK,N+MWRSVJ) + END IF + END SELECT + IF ( WNTVEC .OR. WNTEX .OR. LSAME(JOBZ,'F') ) THEN + JOBZL = 'V' + ELSE + JOBZL = 'N' + END IF + ! Workspace calculation to the DGEEV call + IF ( LSAME(JOBZL,'V') ) THEN + MWRKEV = MAX( 1, 4*N ) + ELSE + MWRKEV = MAX( 1, 3*N ) + END IF + MLWORK = MAX(MLWORK,N+MWRKEV) + IF ( LQUERY ) THEN + CALL DGEEV( 'N', JOBZL, N, S, LDS, REIG, & + IMEIG, W, LDW, W, LDW, RDUMMY, -1, INFO1 ) + LWRKEV = MAX( MWRKEV, INT(RDUMMY(1)) ) + OLWORK = MAX( OLWORK, N+LWRKEV ) + END IF +! + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -29 + IF ( LWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -27 + END IF +! + IF( INFO /= 0 ) THEN + CALL XERBLA( 'DGEDMD', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + WORK(1) = MLWORK + WORK(2) = OLWORK + RETURN + END IF +!............................................................ +! + OFL = DLAMCH('O') + SMALL = DLAMCH('S') + BADXY = .FALSE. +! +! <1> Optional scaling of the snapshots (columns of X, Y) +! ========================================================== + IF ( SCCOLX ) THEN + ! The columns of X will be normalized. + ! To prevent overflows, the column norms of X are + ! carefully computed using DLASSQ. + K = 0 + DO i = 1, N + !WORK(i) = DNRM2( M, X(1,i), 1 ) + SSUM = ONE + SCALE = ZERO + CALL DLASSQ( M, X(1,i), 1, SCALE, SSUM ) + IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN + K = 0 + INFO = -8 + CALL XERBLA('DGEDMD',-INFO) + END IF + IF ( (SCALE /= ZERO) .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of X(:,i) overflows. First, X(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. +! Next, the norm of X(:,i) is stored without +! overflow as WORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of X(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL DLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, X(1,i), M, INFO2 ) + WORK(i) = - SCALE * ( ROOTSC / DBLE(M) ) + ELSE +! X(:,i) will be scaled to unit 2-norm + WORK(i) = SCALE * ROOTSC + CALL DLASCL( 'G',0, 0, WORK(i), ONE, M, 1, & + X(1,i), M, INFO2 ) ! LAPACK CALL +! X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC + END IF + ELSE + WORK(i) = ZERO + K = K + 1 + END IF + END DO + IF ( K == N ) THEN + ! All columns of X are zero. Return error code -8. + ! (the 8th input variable had an illegal value) + K = 0 + INFO = -8 + CALL XERBLA('DGEDMD',-INFO) + RETURN + END IF + DO i = 1, N +! Now, apply the same scaling to the columns of Y. + IF ( WORK(i) > ZERO ) THEN + CALL DSCAL( M, ONE/WORK(i), Y(1,i), 1 ) ! BLAS CALL +! Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC + ELSE IF ( WORK(i) < ZERO ) THEN + CALL DLASCL( 'G', 0, 0, -WORK(i), & + ONE/DBLE(M), M, 1, Y(1,i), M, INFO2 ) ! LAPACK CALL + ELSE IF ( Y(IDAMAX(M, Y(1,i),1),i ) & + /= ZERO ) THEN +! X(:,i) is zero vector. For consistency, +! Y(:,i) should also be zero. If Y(:,i) is not +! zero, then the data might be inconsistent or +! corrupted. If JOBS == 'C', Y(:,i) is set to +! zero and a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + IF ( LSAME(JOBS,'C')) & + CALL DSCAL( M, ZERO, Y(1,i), 1 ) ! BLAS CALL + END IF + END DO + END IF + ! + IF ( SCCOLY ) THEN + ! The columns of Y will be normalized. + ! To prevent overflows, the column norms of Y are + ! carefully computed using DLASSQ. + DO i = 1, N + !WORK(i) = DNRM2( M, Y(1,i), 1 ) + SSUM = ONE + SCALE = ZERO + CALL DLASSQ( M, Y(1,i), 1, SCALE, SSUM ) + IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN + K = 0 + INFO = -10 + CALL XERBLA('DGEDMD',-INFO) + END IF + IF ( SCALE /= ZERO .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of Y(:,i) overflows. First, Y(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. +! Next, the norm of Y(:,i) is stored without +! overflow as WORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of Y(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL DLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, Y(1,i), M, INFO2 ) + WORK(i) = - SCALE * ( ROOTSC / DBLE(M) ) + ELSE +! X(:,i) will be scaled to unit 2-norm + WORK(i) = SCALE * ROOTSC + CALL DLASCL( 'G',0, 0, WORK(i), ONE, M, 1, & + Y(1,i), M, INFO2 ) ! LAPACK CALL +! Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC + END IF + ELSE + WORK(i) = ZERO + END IF + END DO + DO i = 1, N +! Now, apply the same scaling to the columns of X. + IF ( WORK(i) > ZERO ) THEN + CALL DSCAL( M, ONE/WORK(i), X(1,i), 1 ) ! BLAS CALL +! X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC + ELSE IF ( WORK(i) < ZERO ) THEN + CALL DLASCL( 'G', 0, 0, -WORK(i), & + ONE/DBLE(M), M, 1, X(1,i), M, INFO2 ) ! LAPACK CALL + ELSE IF ( X(IDAMAX(M, X(1,i),1),i ) & + /= ZERO ) THEN +! Y(:,i) is zero vector. If X(:,i) is not +! zero, then a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + END IF + END DO + END IF +! +! <2> SVD of the data snapshot matrix X. +! ===================================== +! The left singular vectors are stored in the array X. +! The right singular vectors are in the array W. +! The array W will later on contain the eigenvectors +! of a Rayleigh quotient. + NUMRNK = N + SELECT CASE ( WHTSVD ) + CASE (1) + CALL DGESVD( 'O', 'S', M, N, X, LDX, WORK, B, & + LDB, W, LDW, WORK(N+1), LWORK-N, INFO1 ) ! LAPACK CALL + T_OR_N = 'T' + CASE (2) + CALL DGESDD( 'O', M, N, X, LDX, WORK, B, LDB, W, & + LDW, WORK(N+1), LWORK-N, IWORK, INFO1 ) ! LAPACK CALL + T_OR_N = 'T' + CASE (3) + CALL DGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, WORK, Z, LDZ, W, LDW, & + NUMRNK, IWORK, LIWORK, WORK(N+MAX(2,M)+1),& + LWORK-N-MAX(2,M), WORK(N+1), MAX(2,M), INFO1) ! LAPACK CALL + CALL DLACPY( 'A', M, NUMRNK, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'T' + CASE (4) + CALL DGEJSV( 'F', 'U', JSVOPT, 'N', 'N', 'P', M, & + N, X, LDX, WORK, Z, LDZ, W, LDW, & + WORK(N+1), LWORK-N, IWORK, INFO1 ) ! LAPACK CALL + CALL DLACPY( 'A', M, N, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'N' + XSCL1 = WORK(N+1) + XSCL2 = WORK(N+2) + IF ( XSCL1 /= XSCL2 ) THEN + ! This is an exceptional situation. If the + ! data matrices are not scaled and the + ! largest singular value of X overflows. + ! In that case DGEJSV can return the SVD + ! in scaled form. The scaling factor can be used + ! to rescale the data (X and Y). + CALL DLASCL( 'G', 0, 0, XSCL1, XSCL2, M, N, Y, LDY, INFO2 ) + END IF + END SELECT +! + IF ( INFO1 > 0 ) THEN + ! The SVD selected subroutine did not converge. + ! Return with an error code. + INFO = 2 + RETURN + END IF +! + IF ( WORK(1) == ZERO ) THEN + ! The largest computed singular value of (scaled) + ! X is zero. Return error code -8 + ! (the 8th input variable had an illegal value). + K = 0 + INFO = -8 + CALL XERBLA('DGEDMD',-INFO) + RETURN + END IF +! + !<3> Determine the numerical rank of the data + ! snapshots matrix X. This depends on the + ! parameters NRNK and TOL. + + SELECT CASE ( NRNK ) + CASE ( -1 ) + K = 1 + DO i = 2, NUMRNK + IF ( ( WORK(i) <= WORK(1)*TOL ) .OR. & + ( WORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE ( -2 ) + K = 1 + DO i = 1, NUMRNK-1 + IF ( ( WORK(i+1) <= WORK(i)*TOL ) .OR. & + ( WORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE DEFAULT + K = 1 + DO i = 2, NRNK + IF ( WORK(i) <= SMALL ) EXIT + K = K + 1 + END DO + END SELECT + ! Now, U = X(1:M,1:K) is the SVD/POD basis for the + ! snapshot data in the input matrix X. + + !<4> Compute the Rayleigh quotient S = U^T * A * U. + ! Depending on the requested outputs, the computation + ! is organized to compute additional auxiliary + ! matrices (for the residuals and refinements). + ! + ! In all formulas below, we need V_k*Sigma_k^(-1) + ! where either V_k is in W(1:N,1:K), or V_k^T is in + ! W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). + IF ( LSAME(T_OR_N, 'N') ) THEN + DO i = 1, K + CALL DSCAL( N, ONE/WORK(i), W(1,i), 1 ) ! BLAS CALL + ! W(1:N,i) = (ONE/WORK(i)) * W(1:N,i) ! INTRINSIC + END DO + ELSE + ! This non-unit stride access is due to the fact + ! that DGESVD, DGESVDQ and DGESDD return the + ! transposed matrix of the right singular vectors. + !DO i = 1, K + ! CALL DSCAL( N, ONE/WORK(i), W(i,1), LDW ) ! BLAS CALL + ! ! W(i,1:N) = (ONE/WORK(i)) * W(i,1:N) ! INTRINSIC + !END DO + DO i = 1, K + WORK(N+i) = ONE/WORK(i) + END DO + DO j = 1, N + DO i = 1, K + W(i,j) = (WORK(N+i))*W(i,j) + END DO + END DO + END IF +! + IF ( WNTREF ) THEN + ! + ! Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) + ! for computing the refined Ritz vectors + ! (optionally, outside DGEDMD). + CALL DGEMM( 'N', T_OR_N, M, K, N, ONE, Y, LDY, W, & + LDW, ZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T' + ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K)) ! INTRINSIC, for T_OR_N=='N' + ! + ! At this point Z contains + ! A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and + ! this is needed for computing the residuals. + ! This matrix is returned in the array B and + ! it can be used to compute refined Ritz vectors. + CALL DLACPY( 'A', M, K, Z, LDZ, B, LDB ) ! BLAS CALL + ! B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC + + CALL DGEMM( 'T', 'N', K, K, M, ONE, X, LDX, Z, & + LDZ, ZERO, S, LDS ) ! BLAS CALL + ! S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRINSIC + ! At this point S = U^T * A * U is the Rayleigh quotient. + ELSE + ! A * U(:,1:K) is not explicitly needed and the + ! computation is organized differently. The Rayleigh + ! quotient is computed more efficiently. + CALL DGEMM( 'T', 'N', K, N, M, ONE, X, LDX, Y, LDY, & + ZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) ) ! INTRINSIC + ! In the two DGEMM calls here, can use K for LDZ. + CALL DGEMM( 'N', T_OR_N, K, K, N, ONE, Z, LDZ, W, & + LDW, ZERO, S, LDS ) ! BLAS CALL + ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T' + ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K))) ! INTRINSIC, for T_OR_N=='N' + ! At this point S = U^T * A * U is the Rayleigh quotient. + ! If the residuals are requested, save scaled V_k into Z. + ! Recall that V_k or V_k^T is stored in W. + IF ( WNTRES .OR. WNTEX ) THEN + IF ( LSAME(T_OR_N, 'N') ) THEN + CALL DLACPY( 'A', N, K, W, LDW, Z, LDZ ) + ELSE + CALL DLACPY( 'A', K, N, W, LDW, Z, LDZ ) + END IF + END IF + END IF +! + !<5> Compute the Ritz values and (if requested) the + ! right eigenvectors of the Rayleigh quotient. + ! + CALL DGEEV( 'N', JOBZL, K, S, LDS, REIG, IMEIG, W, & + LDW, W, LDW, WORK(N+1), LWORK-N, INFO1 ) ! LAPACK CALL + ! + ! W(1:K,1:K) contains the eigenvectors of the Rayleigh + ! quotient. Even in the case of complex spectrum, all + ! computation is done in real arithmetic. REIG and + ! IMEIG are the real and the imaginary parts of the + ! eigenvalues, so that the spectrum is given as + ! REIG(:) + sqrt(-1)*IMEIG(:). Complex conjugate pairs + ! are listed at consecutive positions. For such a + ! complex conjugate pair of the eigenvalues, the + ! corresponding eigenvectors are also a complex + ! conjugate pair with the real and imaginary parts + ! stored column-wise in W at the corresponding + ! consecutive column indices. See the description of Z. + ! Also, see the description of DGEEV. + IF ( INFO1 > 0 ) THEN + ! DGEEV failed to compute the eigenvalues and + ! eigenvectors of the Rayleigh quotient. + INFO = 3 + RETURN + END IF +! + ! <6> Compute the eigenvectors (if requested) and, + ! the residuals (if requested). + ! + IF ( WNTVEC .OR. WNTEX ) THEN + IF ( WNTRES ) THEN + IF ( WNTREF ) THEN + ! Here, if the refinement is requested, we have + ! A*U(:,1:K) already computed and stored in Z. + ! For the residuals, need Y = A * U(:,1;K) * W. + CALL DGEMM( 'N', 'N', M, K, K, ONE, Z, LDZ, W, & + LDW, ZERO, Y, LDY ) ! BLAS CALL + ! Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC + ! This frees Z; Y contains A * U(:,1:K) * W. + ELSE + ! Compute S = V_k * Sigma_k^(-1) * W, where + ! V_k * Sigma_k^(-1) is stored in Z + CALL DGEMM( T_OR_N, 'N', N, K, K, ONE, Z, LDZ, & + W, LDW, ZERO, S, LDS) + ! Then, compute Z = Y * S = + ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = + ! = A * U(:,1:K) * W(1:K,1:K) + CALL DGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & + LDS, ZERO, Z, LDZ) + ! Save a copy of Z into Y and free Z for holding + ! the Ritz vectors. + CALL DLACPY( 'A', M, K, Z, LDZ, Y, LDY ) + IF ( WNTEX ) CALL DLACPY( 'A', M, K, Z, LDZ, B, LDB ) + END IF + ELSE IF ( WNTEX ) THEN + ! Compute S = V_k * Sigma_k^(-1) * W, where + ! V_k * Sigma_k^(-1) is stored in Z + CALL DGEMM( T_OR_N, 'N', N, K, K, ONE, Z, LDZ, & + W, LDW, ZERO, S, LDS ) + ! Then, compute Z = Y * S = + ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = + ! = A * U(:,1:K) * W(1:K,1:K) + CALL DGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & + LDS, ZERO, B, LDB ) + ! The above call replaces the following two calls + ! that were used in the developing-testing phase. + ! CALL DGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & + ! LDS, ZERO, Z, LDZ) + ! Save a copy of Z into B and free Z for holding + ! the Ritz vectors. + ! CALL DLACPY( 'A', M, K, Z, LDZ, B, LDB ) + END IF +! + ! Compute the real form of the Ritz vectors + IF ( WNTVEC ) CALL DGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, & + ZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC +! + IF ( WNTRES ) THEN + i = 1 + DO WHILE ( i <= K ) + IF ( IMEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL DAXPY( M, -REIG(i), Z(1,i), 1, Y(1,i), 1 ) ! BLAS CALL + ! Y(1:M,i) = Y(1:M,i) - REIG(i) * Z(1:M,i) ! INTRINSIC + RES(i) = DNRM2( M, Y(1,i), 1) ! BLAS CALL + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IMEIG(i) + AB(1,2) = IMEIG(i) + AB(2,2) = REIG(i) + CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), & + LDZ, AB, 2, ONE, Y(1,i), LDY ) ! BLAS CALL + ! Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB ! INTRINSIC + RES(i) = DLANGE( 'F', M, 2, Y(1,i), LDY, & + WORK(N+1) ) ! LAPACK CALL + RES(i+1) = RES(i) + i = i + 2 + END IF + END DO + END IF + END IF +! + IF ( WHTSVD == 4 ) THEN + WORK(N+1) = XSCL1 + WORK(N+2) = XSCL2 + END IF +! +! Successful exit. + IF ( .NOT. BADXY ) THEN + INFO = 0 + ELSE + ! A warning on possible data inconsistency. + ! This should be a rare event. + INFO = 4 + END IF +!............................................................ + RETURN +! ...... + END SUBROUTINE DGEDMD diff --git a/lapack-netlib/SRC/dgedmdq.c b/lapack-netlib/SRC/dgedmdq.c new file mode 100644 index 000000000..a743a3156 --- /dev/null +++ b/lapack-netlib/SRC/dgedmdq.c @@ -0,0 +1,1300 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i= 0 */ +/* The state space dimension (the number of rows of F). */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshots from a single trajectory, */ +/* taken at equidistant discrete times. This is the */ +/* number of columns of F. */ +/* ..... */ +/* F (input/output) REAL(KIND=WP) M-by-N array */ +/* > On entry, */ +/* the columns of F are the sequence of data snapshots */ +/* from a single trajectory, taken at equidistant discrete */ +/* times. It is assumed that the column norms of F are */ +/* in the range of the normalized floating point numbers. */ +/* < On exit, */ +/* If JOBQ == 'Q', the array F contains the orthogonal */ +/* matrix/factor of the QR factorization of the initial */ +/* data snapshots matrix F. See the description of JOBQ. */ +/* If JOBQ == 'N', the entries in F strictly below the main */ +/* diagonal contain, column-wise, the information on the */ +/* Householder vectors, as returned by DGEQRF. The */ +/* remaining information to restore the orthogonal matrix */ +/* of the initial QR factorization is stored in WORK(1:N). */ +/* See the description of WORK. */ +/* ..... */ +/* LDF (input) INTEGER, LDF >= M */ +/* The leading dimension of the array F. */ +/* ..... */ +/* X (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array */ +/* X is used as workspace to hold representations of the */ +/* leading N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, the leading K columns of X contain the leading */ +/* K left singular vectors of the above described content */ +/* of X. To lift them to the space of the left singular */ +/* vectors U(:,1:K)of the input data, pre-multiply with the */ +/* Q factor from the initial QR factorization. */ +/* See the descriptions of F, K, V and Z. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= N */ +/* The leading dimension of the array X. */ +/* ..... */ +/* Y (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array */ +/* Y is used as workspace to hold representations of the */ +/* trailing N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, */ +/* If JOBT == 'R', Y contains the MIN(M,N)-by-N upper */ +/* triangular factor from the QR factorization of the data */ +/* snapshot matrix F. */ +/* ..... */ +/* LDY (input) INTEGER , LDY >= N */ +/* The leading dimension of the array Y. */ +/* ..... */ +/* NRNK (input) INTEGER */ +/* Determines the mode how to compute the numerical rank, */ +/* i.e. how to truncate small singular values of the input */ +/* matrix X. On input, if */ +/* NRNK = -1 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(1) */ +/* This option is recommended. */ +/* NRNK = -2 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(i-1) */ +/* This option is included for R&D purposes. */ +/* It requires highly accurate SVD, which */ +/* may not be feasible. */ +/* The numerical rank can be enforced by using positive */ +/* value of NRNK as follows: */ +/* 0 < NRNK <= N-1 :: at most NRNK largest singular values */ +/* will be used. If the number of the computed nonzero */ +/* singular values is less than NRNK, then only those */ +/* nonzero values will be used and the actually used */ +/* dimension is less than NRNK. The actual number of */ +/* the nonzero singular values is returned in the variable */ +/* K. See the description of K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the SVD/POD basis for the leading N-1 */ +/* data snapshots (columns of F) and the number of the */ +/* computed Ritz pairs. The value of K is determined */ +/* according to the rule set by the parameters NRNK and */ +/* TOL. See the descriptions of NRNK and TOL. */ +/* ..... */ +/* REIG (output) REAL(KIND=WP) (N-1)-by-1 array */ +/* The leading K (K<=N) entries of REIG contain */ +/* the real parts of the computed eigenvalues */ +/* REIG(1:K) + sqrt(-1)*IMEIG(1:K). */ +/* See the descriptions of K, IMEIG, Z. */ +/* ..... */ +/* IMEIG (output) REAL(KIND=WP) (N-1)-by-1 array */ +/* The leading K (K0, then the corresponding complex */ +/* conjugate pair of eigenvalues reads */ +/* LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) */ +/* LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) */ +/* That is, complex conjugate pairs have consequtive */ +/* indices (i,i+1), with the positive imaginary part */ +/* listed first. */ +/* See the descriptions of K, REIG, Z. */ +/* ..... */ +/* Z (workspace/output) REAL(KIND=WP) M-by-(N-1) array */ +/* If JOBZ =='V' then */ +/* Z contains real Ritz vectors as follows: */ +/* If IMEIG(i)=0, then Z(:,i) is an eigenvector of */ +/* the i-th Ritz value. */ +/* If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then */ +/* [Z(:,i) Z(:,i+1)] span an invariant subspace and */ +/* the Ritz values extracted from this subspace are */ +/* REIG(i) + sqrt(-1)*IMEIG(i) and */ +/* REIG(i) - sqrt(-1)*IMEIG(i). */ +/* The corresponding eigenvectors are */ +/* Z(:,i) + sqrt(-1)*Z(:,i+1) and */ +/* Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. */ +/* If JOBZ == 'F', then the above descriptions hold for */ +/* the columns of Z*V, where the columns of V are the */ +/* eigenvectors of the K-by-K Rayleigh quotient, and Z is */ +/* orthonormal. The columns of V are similarly structured: */ +/* If IMEIG(i) == 0 then Z*V(:,i) is an eigenvector, and if */ +/* IMEIG(i) > 0 then Z*V(:,i)+sqrt(-1)*Z*V(:,i+1) and */ +/* Z*V(:,i)-sqrt(-1)*Z*V(:,i+1) */ +/* are the eigenvectors of LAMBDA(i), LAMBDA(i+1). */ +/* See the descriptions of REIG, IMEIG, X and V. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... */ +/* RES (output) REAL(KIND=WP) (N-1)-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs. */ +/* If LAMBDA(i) is real, then */ +/* RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. */ +/* If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair */ +/* then */ +/* RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F */ +/* where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] */ +/* [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. */ +/* It holds that */ +/* RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 */ +/* RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 */ +/* where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) */ +/* ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) */ +/* See the description of Z. */ +/* ..... */ +/* B (output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array. */ +/* IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:N,1;K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* In both cases, the content of B can be lifted to the */ +/* original dimension of the input data by pre-multiplying */ +/* with the Q factor from the initial QR factorization. */ +/* Here A denotes a compression of the underlying operator. */ +/* See the descriptions of F and X. */ +/* If JOBF =='N', then B is not referenced. */ +/* ..... */ +/* LDB (input) INTEGER, LDB >= MIN(M,N) */ +/* The leading dimension of the array B. */ +/* ..... */ +/* V (workspace/output) REAL(KIND=WP) (N-1)-by-(N-1) array */ +/* On exit, V(1:K,1:K) contains the K eigenvectors of */ +/* the Rayleigh quotient. The eigenvectors of a complex */ +/* conjugate pair of eigenvalues are returned in real form */ +/* as explained in the description of Z. The Ritz vectors */ +/* (returned in Z) are the product of X and V; see */ +/* the descriptions of X and Z. */ +/* ..... */ +/* LDV (input) INTEGER, LDV >= N-1 */ +/* The leading dimension of the array V. */ +/* ..... */ +/* S (output) REAL(KIND=WP) (N-1)-by-(N-1) array */ +/* The array S(1:K,1:K) is used for the matrix Rayleigh */ +/* quotient. This content is overwritten during */ +/* the eigenvalue decomposition by DGEEV. */ +/* See the description of K. */ +/* ..... */ +/* LDS (input) INTEGER, LDS >= N-1 */ +/* The leading dimension of the array S. */ +/* ..... */ +/* WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */ +/* On exit, */ +/* WORK(1:MIN(M,N)) contains the scalar factors of the */ +/* elementary reflectors as returned by DGEQRF of the */ +/* M-by-N input matrix F. */ +/* WORK(MIN(M,N)+1:MIN(M,N)+N-1) contains the singular values of */ +/* the input submatrix F(1:M,1:N-1). */ +/* If the call to DGEDMDQ is only workspace query, then */ +/* WORK(1) contains the minimal workspace length and */ +/* WORK(2) is the optimal workspace length. Hence, the */ +/* length of work is at least 2. */ +/* See the description of LWORK. */ +/* ..... */ +/* LWORK (input) INTEGER */ +/* The minimal length of the workspace vector WORK. */ +/* LWORK is calculated as follows: */ +/* Let MLWQR = N (minimal workspace for DGEQRF[M,N]) */ +/* MLWDMD = minimal workspace for DGEDMD (see the */ +/* description of LWORK in DGEDMD) for */ +/* snapshots of dimensions MIN(M,N)-by-(N-1) */ +/* MLWMQR = N (minimal workspace for */ +/* DORMQR['L','N',M,N,N]) */ +/* MLWGQR = N (minimal workspace for DORGQR[M,N,N]) */ +/* Then */ +/* LWORK = MAX(N+MLWQR, N+MLWDMD) */ +/* is updated as follows: */ +/* if JOBZ == 'V' or JOBZ == 'F' THEN */ +/* LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWMQR ) */ +/* if JOBQ == 'Q' THEN */ +/* LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWGQR) */ +/* If on entry LWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */ +/* Workspace that is required only if WHTSVD equals */ +/* 2 , 3 or 4. (See the description of WHTSVD). */ +/* If on entry LWORK =-1 or LIWORK=-1, then the */ +/* minimal length of IWORK is computed and returned in */ +/* IWORK(1). See the description of LIWORK. */ +/* ..... */ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* Let M1=MIN(M,N), N1=N-1. Then */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) */ +/* If on entry LIWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). */ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ + +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ + +/* Local array */ +/* ~~~~~~~~~~~ */ + +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ + +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* External subroutines */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ +/* .......................................................... */ + /* Parameter adjustments */ + f_dim1 = *ldf; + f_offset = 1 + f_dim1 * 1; + f -= f_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --reig; + --imeig; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + v_dim1 = *ldv; + v_offset = 1 + v_dim1 * 1; + v -= v_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --work; + --iwork; + + /* Function Body */ + zero = 0.f; + one = 1.f; + +/* Test the input arguments */ + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntvcf = lsame_(jobz, "F"); + wntvcq = lsame_(jobz, "Q"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + wantq = lsame_(jobq, "Q"); + wnttrf = lsame_(jobt, "R"); + minmn = f2cmin(*m,*n); + *info = 0; + lquery = *lwork == -1 || *liwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || wntvcf || wntvcq || lsame_(jobz, "N"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && lsame_(jobz, "N")) { + *info = -3; + } else if (! (wantq || lsame_(jobq, "N"))) { + *info = -4; + } else if (! (wnttrf || lsame_(jobt, "N"))) { + *info = -5; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -6; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -7; + } else if (*m < 0) { + *info = -8; + } else if (*n < 0 || *n > *m + 1) { + *info = -9; + } else if (*ldf < *m) { + *info = -11; + } else if (*ldx < minmn) { + *info = -13; + } else if (*ldy < minmn) { + *info = -15; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -16; + } else if (*tol < zero || *tol >= one) { + *info = -17; + } else if (*ldz < *m) { + *info = -22; + } else if ((wntref || wntex) && *ldb < minmn) { + *info = -25; + } else if (*ldv < *n - 1) { + *info = -27; + } else if (*lds < *n - 1) { + *info = -29; + } + + if (wntvec || wntvcf || wntvcq) { + *(unsigned char *)jobvl = 'V'; + } else { + *(unsigned char *)jobvl = 'N'; + } + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0 || *n == 1) { +/* All output except K is void. INFO=1 signals */ +/* the void input. In case of a workspace query, */ +/* the minimal workspace lengths are returned. */ + if (lquery) { + iwork[1] = 1; + work[1] = 2.; + work[2] = 2.; + } else { + *k = 0; + } + *info = 1; + return 0; + } + mlwqr = f2cmax(1,*n); +/* Minimal workspace length for DGEQRF. */ + mlwork = minmn + mlwqr; + if (lquery) { + dgeqrf_(m, n, &f[f_offset], ldf, &work[1], rdummy, &c_n1, &info1); + olwqr = (integer) rdummy[0]; + olwork = f2cmin(*m,*n) + olwqr; + } + i__1 = *n - 1; + dgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], + ldx, &y[y_offset], ldy, nrnk, tol, k, &reig[1], &imeig[1], & + z__[z_offset], ldz, &res[1], &b[b_offset], ldb, &v[v_offset], + ldv, &s[s_offset], lds, &work[1], &c_n1, &iwork[1], liwork, & + info1); + mlwdmd = (integer) work[1]; +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwdmd; + mlwork = f2cmax(i__1,i__2); + iminwr = iwork[1]; + if (lquery) { + olwdmd = (integer) work[2]; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwdmd; + olwork = f2cmax(i__1,i__2); + } + if (wntvec || wntvcf) { + mlwmqr = f2cmax(1,*n); +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + *n - 1 + mlwmqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + dormqr_("L", "N", m, n, &minmn, &f[f_offset], ldf, &work[1], & + z__[z_offset], ldz, &work[1], &c_n1, &info1); + olwmqr = (integer) work[1]; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + *n - 1 + olwmqr; + olwork = f2cmax(i__1,i__2); + } + } + if (wantq) { + mlwgqr = *n; +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + *n - 1 + mlwgqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + dorgqr_(m, &minmn, &minmn, &f[f_offset], ldf, &work[1], &work[ + 1], &c_n1, &info1); + olwgqr = (integer) work[1]; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + *n - 1 + olwgqr; + olwork = f2cmax(i__1,i__2); + } + } + iminwr = f2cmax(1,iminwr); + mlwork = f2cmax(2,mlwork); + if (*lwork < mlwork && ! lquery) { + *info = -31; + } + if (*liwork < iminwr && ! lquery) { + *info = -33; + } + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("DGEDMDQ", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + work[1] = (doublereal) mlwork; + work[2] = (doublereal) olwork; + return 0; + } +/* ..... */ +/* Initial QR factorization that is used to represent the */ +/* snapshots as elements of lower dimensional subspace. */ +/* For large scale computation with M >>N , at this place */ +/* one can use an out of core QRF. */ + + i__1 = *lwork - minmn; + dgeqrf_(m, n, &f[f_offset], ldf, &work[1], &work[minmn + 1], &i__1, & + info1); + +/* Define X and Y as the snapshots representations in the */ +/* orthogonal basis computed in the QR factorization. */ +/* X corresponds to the leading N-1 and Y to the trailing */ +/* N-1 snapshots. */ + i__1 = *n - 1; + dlaset_("L", &minmn, &i__1, &zero, &zero, &x[x_offset], ldx); + i__1 = *n - 1; + dlacpy_("U", &minmn, &i__1, &f[f_offset], ldf, &x[x_offset], ldx); + i__1 = *n - 1; + dlacpy_("A", &minmn, &i__1, &f[(f_dim1 << 1) + 1], ldf, &y[y_offset], ldy); + if (*m >= 3) { + i__1 = minmn - 2; + i__2 = *n - 2; + dlaset_("L", &i__1, &i__2, &zero, &zero, &y[y_dim1 + 3], ldy); + } + +/* Compute the DMD of the projected snapshot pairs (X,Y) */ + i__1 = *n - 1; + i__2 = *lwork - minmn; + dgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], ldx, + &y[y_offset], ldy, nrnk, tol, k, &reig[1], &imeig[1], &z__[ + z_offset], ldz, &res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[ + s_offset], lds, &work[minmn + 1], &i__2, &iwork[1], liwork, & + info1); + if (info1 == 2 || info1 == 3) { +/* Return with error code. See DGEDMD for details. */ + *info = info1; + return 0; + } else { + *info = info1; + } + +/* The Ritz vectors (Koopman modes) can be explicitly */ +/* formed or returned in factored form. */ + if (wntvec) { +/* Compute the eigenvectors explicitly. */ + if (*m > minmn) { + i__1 = *m - minmn; + dlaset_("A", &i__1, k, &zero, &zero, &z__[minmn + 1 + z_dim1], + ldz); + } + i__1 = *lwork - (minmn + *n - 1); + dormqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &work[1], &z__[ + z_offset], ldz, &work[minmn + *n], &i__1, &info1); + } else if (wntvcf) { +/* Return the Ritz vectors (eigenvectors) in factored */ +/* form Z*V, where Z contains orthonormal matrix (the */ +/* product of Q from the initial QR factorization and */ +/* the SVD/POD_basis returned by DGEDMD in X) and the */ +/* second factor (the eigenvectors of the Rayleigh */ +/* quotient) is in the array V, as returned by DGEDMD. */ + dlacpy_("A", n, k, &x[x_offset], ldx, &z__[z_offset], ldz); + if (*m > *n) { + i__1 = *m - *n; + dlaset_("A", &i__1, k, &zero, &zero, &z__[*n + 1 + z_dim1], ldz); + } + i__1 = *lwork - (minmn + *n - 1); + dormqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &work[1], &z__[ + z_offset], ldz, &work[minmn + *n], &i__1, &info1); + } + +/* Some optional output variables: */ + +/* The upper triangular factor R in the initial QR */ +/* factorization is optionally returned in the array Y. */ +/* This is useful if this call to DGEDMDQ is to be */ +/* followed by a streaming DMD that is implemented in a */ +/* QR compressed form. */ + if (wnttrf) { +/* Return the upper triangular R in Y */ + dlaset_("A", &minmn, n, &zero, &zero, &y[y_offset], ldy); + dlacpy_("U", &minmn, n, &f[f_offset], ldf, &y[y_offset], ldy); + } + +/* The orthonormal/orthogonal factor Q in the initial QR */ +/* factorization is optionally returned in the array F. */ +/* Same as with the triangular factor above, this is */ +/* useful in a streaming DMD. */ + if (wantq) { +/* Q overwrites F */ + i__1 = *lwork - (minmn + *n - 1); + dorgqr_(m, &minmn, &minmn, &f[f_offset], ldf, &work[1], &work[minmn + + *n], &i__1, &info1); + } + + return 0; + +} /* dgedmdq_ */ + diff --git a/lapack-netlib/SRC/dgedmdq.f90 b/lapack-netlib/SRC/dgedmdq.f90 new file mode 100644 index 000000000..bedfba472 --- /dev/null +++ b/lapack-netlib/SRC/dgedmdq.f90 @@ -0,0 +1,704 @@ +SUBROUTINE DGEDMDQ( JOBS, JOBZ, JOBR, JOBQ, JOBT, JOBF, & + WHTSVD, M, N, F, LDF, X, LDX, Y, & + LDY, NRNK, TOL, K, REIG, IMEIG, & + Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, WORK, LWORK, IWORK, LIWORK, INFO ) +! March 2023 +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 +!..... +! Scalar arguments + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBQ, & + JOBT, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDF, LDX, & + LDY, NRNK, LDZ, LDB, LDV, & + LDS, LWORK, LIWORK + INTEGER, INTENT(OUT) :: INFO, K + REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments + REAL(KIND=WP), INTENT(INOUT) :: F(LDF,*) + REAL(KIND=WP), INTENT(OUT) :: X(LDX,*), Y(LDY,*), & + Z(LDZ,*), B(LDB,*), & + V(LDV,*), S(LDS,*) + REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & + RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +!..... +! Purpose +! ======= +! DGEDMDQ computes the Dynamic Mode Decomposition (DMD) for +! a pair of data snapshot matrices, using a QR factorization +! based compression of the data. For the input matrices +! X and Y such that Y = A*X with an unaccessible matrix +! A, DGEDMDQ computes a certain number of Ritz pairs of A using +! the standard Rayleigh-Ritz extraction from a subspace of +! range(X) that is determined using the leading left singular +! vectors of X. Optionally, DGEDMDQ returns the residuals +! of the computed Ritz pairs, the information needed for +! a refinement of the Ritz vectors, or the eigenvectors of +! the Exact DMD. +! For further details see the references listed +! below. For more details of the implementation see [3]. +! +! References +! ========== +! [1] P. Schmid: Dynamic mode decomposition of numerical +! and experimental data, +! Journal of Fluid Mechanics 656, 5-28, 2010. +! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +! decompositions: analysis and enhancements, +! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +! [3] Z. Drmac: A LAPACK implementation of the Dynamic +! Mode Decomposition I. Technical report. AIMDyn Inc. +! and LAPACK Working Note 298. +! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +! Brunton, N. Kutz: On Dynamic Mode Decomposition: +! Theory and Applications, Journal of Computational +! Dynamics 1(2), 391 -421, 2014. +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! and supported by +! - DARPA SBIR project "Koopman Operator-Based Forecasting +! for Nonstationary Processes from Near-Term, Limited +! Observational Data" Contract No: W31P4Q-21-C-0007 +! - DARPA PAI project "Physics-Informed Machine Learning +! Methodologies" Contract No: HR0011-18-9-0033 +! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +! Framework for Space-Time Analysis of Process Dynamics" +! Contract No: HR0011-16-C-0116 +! Any opinions, findings and conclusions or recommendations +! expressed in this material are those of the author and +! do not necessarily reflect the views of the DARPA SBIR +! Program Office. +!============================================================ +! Distribution Statement A: +! Approved for Public Release, Distribution Unlimited. +! Cleared by DARPA on September 29, 2022 +!============================================================ +!...................................................................... +! Arguments +! ========= +! JOBS (input) CHARACTER*1 +! Determines whether the initial data snapshots are scaled +! by a diagonal matrix. The data snapshots are the columns +! of F. The leading N-1 columns of F are denoted X and the +! trailing N-1 columns are denoted Y. +! 'S' :: The data snapshots matrices X and Y are multiplied +! with a diagonal matrix D so that X*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'C' :: The snapshots are scaled as with the 'S' option. +! If it is found that an i-th column of X is zero +! vector and the corresponding i-th column of Y is +! non-zero, then the i-th column of Y is set to +! zero and a warning flag is raised. +! 'Y' :: The data snapshots matrices X and Y are multiplied +! by a diagonal matrix D so that Y*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'N' :: No data scaling. +!..... +! JOBZ (input) CHARACTER*1 +! Determines whether the eigenvectors (Koopman modes) will +! be computed. +! 'V' :: The eigenvectors (Koopman modes) will be computed +! and returned in the matrix Z. +! See the description of Z. +! 'F' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Z*V, where Z +! is orthonormal and V contains the eigenvectors +! of the corresponding Rayleigh quotient. +! See the descriptions of F, V, Z. +! 'Q' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Q*Z, where Z +! contains the eigenvectors of the compression of the +! underlying discretized operator onto the span of +! the data snapshots. See the descriptions of F, V, Z. +! Q is from the initial QR factorization. +! 'N' :: The eigenvectors are not computed. +!..... +! JOBR (input) CHARACTER*1 +! Determines whether to compute the residuals. +! 'R' :: The residuals for the computed eigenpairs will +! be computed and stored in the array RES. +! See the description of RES. +! For this option to be legal, JOBZ must be 'V'. +! 'N' :: The residuals are not computed. +!..... +! JOBQ (input) CHARACTER*1 +! Specifies whether to explicitly compute and return the +! orthogonal matrix from the QR factorization. +! 'Q' :: The matrix Q of the QR factorization of the data +! snapshot matrix is computed and stored in the +! array F. See the description of F. +! 'N' :: The matrix Q is not explicitly computed. +!..... +! JOBT (input) CHARACTER*1 +! Specifies whether to return the upper triangular factor +! from the QR factorization. +! 'R' :: The matrix R of the QR factorization of the data +! snapshot matrix F is returned in the array Y. +! See the description of Y and Further details. +! 'N' :: The matrix R is not returned. +!..... +! JOBF (input) CHARACTER*1 +! Specifies whether to store information needed for post- +! processing (e.g. computing refined Ritz vectors) +! 'R' :: The matrix needed for the refinement of the Ritz +! vectors is computed and stored in the array B. +! See the description of B. +! 'E' :: The unscaled eigenvectors of the Exact DMD are +! computed and returned in the array B. See the +! description of B. +! 'N' :: No eigenvector refinement data is computed. +! To be useful on exit, this option needs JOBQ='Q'. +!..... +! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +! Allows for a selection of the SVD algorithm from the +! LAPACK library. +! 1 :: DGESVD (the QR SVD algorithm) +! 2 :: DGESDD (the Divide and Conquer algorithm; if enough +! workspace available, this is the fastest option) +! 3 :: DGESVDQ (the preconditioned QR SVD ; this and 4 +! are the most accurate options) +! 4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3 +! are the most accurate options) +! For the four methods above, a significant difference in +! the accuracy of small singular values is possible if +! the snapshots vary in norm so that X is severely +! ill-conditioned. If small (smaller than EPS*||X||) +! singular values are of interest and JOBS=='N', then +! the options (3, 4) give the most accurate results, where +! the option 4 is slightly better and with stronger +! theoretical background. +! If JOBS=='S', i.e. the columns of X will be normalized, +! then all methods give nearly equally accurate results. +!..... +! M (input) INTEGER, M >= 0 +! The state space dimension (the number of rows of F). +!..... +! N (input) INTEGER, 0 <= N <= M +! The number of data snapshots from a single trajectory, +! taken at equidistant discrete times. This is the +! number of columns of F. +!..... +! F (input/output) REAL(KIND=WP) M-by-N array +! > On entry, +! the columns of F are the sequence of data snapshots +! from a single trajectory, taken at equidistant discrete +! times. It is assumed that the column norms of F are +! in the range of the normalized floating point numbers. +! < On exit, +! If JOBQ == 'Q', the array F contains the orthogonal +! matrix/factor of the QR factorization of the initial +! data snapshots matrix F. See the description of JOBQ. +! If JOBQ == 'N', the entries in F strictly below the main +! diagonal contain, column-wise, the information on the +! Householder vectors, as returned by DGEQRF. The +! remaining information to restore the orthogonal matrix +! of the initial QR factorization is stored in WORK(1:N). +! See the description of WORK. +!..... +! LDF (input) INTEGER, LDF >= M +! The leading dimension of the array F. +!..... +! X (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array +! X is used as workspace to hold representations of the +! leading N-1 snapshots in the orthonormal basis computed +! in the QR factorization of F. +! On exit, the leading K columns of X contain the leading +! K left singular vectors of the above described content +! of X. To lift them to the space of the left singular +! vectors U(:,1:K)of the input data, pre-multiply with the +! Q factor from the initial QR factorization. +! See the descriptions of F, K, V and Z. +!..... +! LDX (input) INTEGER, LDX >= N +! The leading dimension of the array X. +!..... +! Y (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array +! Y is used as workspace to hold representations of the +! trailing N-1 snapshots in the orthonormal basis computed +! in the QR factorization of F. +! On exit, +! If JOBT == 'R', Y contains the MIN(M,N)-by-N upper +! triangular factor from the QR factorization of the data +! snapshot matrix F. +!..... +! LDY (input) INTEGER , LDY >= N +! The leading dimension of the array Y. +!..... +! NRNK (input) INTEGER +! Determines the mode how to compute the numerical rank, +! i.e. how to truncate small singular values of the input +! matrix X. On input, if +! NRNK = -1 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(1) +! This option is recommended. +! NRNK = -2 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(i-1) +! This option is included for R&D purposes. +! It requires highly accurate SVD, which +! may not be feasible. +! The numerical rank can be enforced by using positive +! value of NRNK as follows: +! 0 < NRNK <= N-1 :: at most NRNK largest singular values +! will be used. If the number of the computed nonzero +! singular values is less than NRNK, then only those +! nonzero values will be used and the actually used +! dimension is less than NRNK. The actual number of +! the nonzero singular values is returned in the variable +! K. See the description of K. +!..... +! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +! The tolerance for truncating small singular values. +! See the description of NRNK. +!..... +! K (output) INTEGER, 0 <= K <= N +! The dimension of the SVD/POD basis for the leading N-1 +! data snapshots (columns of F) and the number of the +! computed Ritz pairs. The value of K is determined +! according to the rule set by the parameters NRNK and +! TOL. See the descriptions of NRNK and TOL. +!..... +! REIG (output) REAL(KIND=WP) (N-1)-by-1 array +! The leading K (K<=N) entries of REIG contain +! the real parts of the computed eigenvalues +! REIG(1:K) + sqrt(-1)*IMEIG(1:K). +! See the descriptions of K, IMEIG, Z. +!..... +! IMEIG (output) REAL(KIND=WP) (N-1)-by-1 array +! The leading K (K0, then the corresponding complex +! conjugate pair of eigenvalues reads +! LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) +! LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) +! That is, complex conjugate pairs have consequtive +! indices (i,i+1), with the positive imaginary part +! listed first. +! See the descriptions of K, REIG, Z. +!..... +! Z (workspace/output) REAL(KIND=WP) M-by-(N-1) array +! If JOBZ =='V' then +! Z contains real Ritz vectors as follows: +! If IMEIG(i)=0, then Z(:,i) is an eigenvector of +! the i-th Ritz value. +! If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then +! [Z(:,i) Z(:,i+1)] span an invariant subspace and +! the Ritz values extracted from this subspace are +! REIG(i) + sqrt(-1)*IMEIG(i) and +! REIG(i) - sqrt(-1)*IMEIG(i). +! The corresponding eigenvectors are +! Z(:,i) + sqrt(-1)*Z(:,i+1) and +! Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. +! If JOBZ == 'F', then the above descriptions hold for +! the columns of Z*V, where the columns of V are the +! eigenvectors of the K-by-K Rayleigh quotient, and Z is +! orthonormal. The columns of V are similarly structured: +! If IMEIG(i) == 0 then Z*V(:,i) is an eigenvector, and if +! IMEIG(i) > 0 then Z*V(:,i)+sqrt(-1)*Z*V(:,i+1) and +! Z*V(:,i)-sqrt(-1)*Z*V(:,i+1) +! are the eigenvectors of LAMBDA(i), LAMBDA(i+1). +! See the descriptions of REIG, IMEIG, X and V. +!..... +! LDZ (input) INTEGER , LDZ >= M +! The leading dimension of the array Z. +!..... +! RES (output) REAL(KIND=WP) (N-1)-by-1 array +! RES(1:K) contains the residuals for the K computed +! Ritz pairs. +! If LAMBDA(i) is real, then +! RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. +! If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair +! then +! RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F +! where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] +! [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. +! It holds that +! RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 +! RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 +! where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) +! ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) +! See the description of Z. +!..... +! B (output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array. +! IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can +! be used for computing the refined vectors; see further +! details in the provided references. +! If JOBF == 'E', B(1:N,1;K) contains +! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +! Exact DMD, up to scaling by the inverse eigenvalues. +! In both cases, the content of B can be lifted to the +! original dimension of the input data by pre-multiplying +! with the Q factor from the initial QR factorization. +! Here A denotes a compression of the underlying operator. +! See the descriptions of F and X. +! If JOBF =='N', then B is not referenced. +!..... +! LDB (input) INTEGER, LDB >= MIN(M,N) +! The leading dimension of the array B. +!..... +! V (workspace/output) REAL(KIND=WP) (N-1)-by-(N-1) array +! On exit, V(1:K,1:K) contains the K eigenvectors of +! the Rayleigh quotient. The eigenvectors of a complex +! conjugate pair of eigenvalues are returned in real form +! as explained in the description of Z. The Ritz vectors +! (returned in Z) are the product of X and V; see +! the descriptions of X and Z. +!..... +! LDV (input) INTEGER, LDV >= N-1 +! The leading dimension of the array V. +!..... +! S (output) REAL(KIND=WP) (N-1)-by-(N-1) array +! The array S(1:K,1:K) is used for the matrix Rayleigh +! quotient. This content is overwritten during +! the eigenvalue decomposition by DGEEV. +! See the description of K. +!..... +! LDS (input) INTEGER, LDS >= N-1 +! The leading dimension of the array S. +!..... +! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +! On exit, +! WORK(1:MIN(M,N)) contains the scalar factors of the +! elementary reflectors as returned by DGEQRF of the +! M-by-N input matrix F. +! WORK(MIN(M,N)+1:MIN(M,N)+N-1) contains the singular values of +! the input submatrix F(1:M,1:N-1). +! If the call to DGEDMDQ is only workspace query, then +! WORK(1) contains the minimal workspace length and +! WORK(2) is the optimal workspace length. Hence, the +! length of work is at least 2. +! See the description of LWORK. +!..... +! LWORK (input) INTEGER +! The minimal length of the workspace vector WORK. +! LWORK is calculated as follows: +! Let MLWQR = N (minimal workspace for DGEQRF[M,N]) +! MLWDMD = minimal workspace for DGEDMD (see the +! description of LWORK in DGEDMD) for +! snapshots of dimensions MIN(M,N)-by-(N-1) +! MLWMQR = N (minimal workspace for +! DORMQR['L','N',M,N,N]) +! MLWGQR = N (minimal workspace for DORGQR[M,N,N]) +! Then +! LWORK = MAX(N+MLWQR, N+MLWDMD) +! is updated as follows: +! if JOBZ == 'V' or JOBZ == 'F' THEN +! LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWMQR ) +! if JOBQ == 'Q' THEN +! LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWGQR) +! If on entry LWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! IWORK (workspace/output) INTEGER LIWORK-by-1 array +! Workspace that is required only if WHTSVD equals +! 2 , 3 or 4. (See the description of WHTSVD). +! If on entry LWORK =-1 or LIWORK=-1, then the +! minimal length of IWORK is computed and returned in +! IWORK(1). See the description of LIWORK. +!..... +! LIWORK (input) INTEGER +! The minimal length of the workspace vector IWORK. +! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +! Let M1=MIN(M,N), N1=N-1. Then +! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) +! If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) +! If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) +! If on entry LIWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! INFO (output) INTEGER +! -i < 0 :: On entry, the i-th argument had an +! illegal value +! = 0 :: Successful return. +! = 1 :: Void input. Quick exit (M=0 or N=0). +! = 2 :: The SVD computation of X did not converge. +! Suggestion: Check the input data and/or +! repeat with different WHTSVD. +! = 3 :: The computation of the eigenvalues did not +! converge. +! = 4 :: If data scaling was requested on input and +! the procedure found inconsistency in the data +! such that for some column index i, +! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +! to zero if JOBS=='C'. The computation proceeds +! with original or modified data and warning +! flag is set with INFO=4. +!............................................................. +!............................................................. +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP +! +! Local scalars +! ~~~~~~~~~~~~~ + INTEGER :: IMINWR, INFO1, MLWDMD, MLWGQR, & + MLWMQR, MLWORK, MLWQR, MINMN, & + OLWDMD, OLWGQR, OLWMQR, OLWORK, & + OLWQR + LOGICAL :: LQUERY, SCCOLX, SCCOLY, WANTQ, & + WNTTRF, WNTRES, WNTVEC, WNTVCF, & + WNTVCQ, WNTREF, WNTEX + CHARACTER(LEN=1) :: JOBVL +! +! Local array +! ~~~~~~~~~~~ + REAL(KIND=WP) :: RDUMMY(2) +! +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + LOGICAL LSAME + EXTERNAL LSAME +! +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL DGEMM + EXTERNAL DGEQRF, DLACPY, DLASET, DORGQR, & + DORMQR, XERBLA + +! External subroutines +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL DGEDMD + +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC MAX, MIN, INT + !.......................................................... + ! + ! Test the input arguments + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME( JOBS, 'C' ) + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTVCF = LSAME(JOBZ,'F') + WNTVCQ = LSAME(JOBZ,'Q') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + WANTQ = LSAME(JOBQ,'Q') + WNTTRF = LSAME(JOBT,'R') + MINMN = MIN(M,N) + INFO = 0 + LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. & + LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. WNTVCF .OR. WNTVCQ & + .OR. LSAME(JOBZ,'N')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. LSAME(JOBZ,'N') ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WANTQ .OR. LSAME(JOBQ,'N')) ) THEN + INFO = -4 + ELSE IF ( .NOT. ( WNTTRF .OR. LSAME(JOBT,'N') ) ) THEN + INFO = -5 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -6 + ELSE IF ( .NOT. ((WHTSVD == 1).OR.(WHTSVD == 2).OR. & + (WHTSVD == 3).OR.(WHTSVD == 4)) ) THEN + INFO = -7 + ELSE IF ( M < 0 ) THEN + INFO = -8 + ELSE IF ( ( N < 0 ) .OR. ( N > M+1 ) ) THEN + INFO = -9 + ELSE IF ( LDF < M ) THEN + INFO = -11 + ELSE IF ( LDX < MINMN ) THEN + INFO = -13 + ELSE IF ( LDY < MINMN ) THEN + INFO = -15 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -16 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -17 + ELSE IF ( LDZ < M ) THEN + INFO = -22 + ELSE IF ( (WNTREF.OR.WNTEX ).AND.( LDB < MINMN ) ) THEN + INFO = -25 + ELSE IF ( LDV < N-1 ) THEN + INFO = -27 + ELSE IF ( LDS < N-1 ) THEN + INFO = -29 + END IF +! + IF ( WNTVEC .OR. WNTVCF .OR. WNTVCQ ) THEN + JOBVL = 'V' + ELSE + JOBVL = 'N' + END IF + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( ( N == 0 ) .OR. ( N == 1 ) ) THEN + ! All output except K is void. INFO=1 signals + ! the void input. In case of a workspace query, + ! the minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + WORK(1) = 2 + WORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + MLWQR = MAX(1,N) ! Minimal workspace length for DGEQRF. + MLWORK = MINMN + MLWQR + IF ( LQUERY ) THEN + CALL DGEQRF( M, N, F, LDF, WORK, RDUMMY, -1, & + INFO1 ) + OLWQR = INT(RDUMMY(1)) + OLWORK = MIN(M,N) + OLWQR + END IF + CALL DGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,& + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + REIG, IMEIG, Z, LDZ, RES, B, LDB, & + V, LDV, S, LDS, WORK, -1, IWORK, & + LIWORK, INFO1 ) + MLWDMD = INT(WORK(1)) + MLWORK = MAX(MLWORK, MINMN + MLWDMD) + IMINWR = IWORK(1) + IF ( LQUERY ) THEN + OLWDMD = INT(WORK(2)) + OLWORK = MAX(OLWORK, MINMN+OLWDMD) + END IF + IF ( WNTVEC .OR. WNTVCF ) THEN + MLWMQR = MAX(1,N) + MLWORK = MAX(MLWORK,MINMN+N-1+MLWMQR) + IF ( LQUERY ) THEN + CALL DORMQR( 'L','N', M, N, MINMN, F, LDF, & + WORK, Z, LDZ, WORK, -1, INFO1 ) + OLWMQR = INT(WORK(1)) + OLWORK = MAX(OLWORK,MINMN+N-1+OLWMQR) + END IF + END IF + IF ( WANTQ ) THEN + MLWGQR = N + MLWORK = MAX(MLWORK,MINMN+N-1+MLWGQR) + IF ( LQUERY ) THEN + CALL DORGQR( M, MINMN, MINMN, F, LDF, WORK, & + WORK, -1, INFO1 ) + OLWGQR = INT(WORK(1)) + OLWORK = MAX(OLWORK,MINMN+N-1+OLWGQR) + END IF + END IF + IMINWR = MAX( 1, IMINWR ) + MLWORK = MAX( 2, MLWORK ) + IF ( LWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -31 + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -33 + END IF + IF( INFO /= 0 ) THEN + CALL XERBLA( 'DGEDMDQ', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + WORK(1) = MLWORK + WORK(2) = OLWORK + RETURN + END IF +!..... +! Initial QR factorization that is used to represent the +! snapshots as elements of lower dimensional subspace. +! For large scale computation with M >>N , at this place +! one can use an out of core QRF. +! + CALL DGEQRF( M, N, F, LDF, WORK, & + WORK(MINMN+1), LWORK-MINMN, INFO1 ) +! +! Define X and Y as the snapshots representations in the +! orthogonal basis computed in the QR factorization. +! X corresponds to the leading N-1 and Y to the trailing +! N-1 snapshots. + CALL DLASET( 'L', MINMN, N-1, ZERO, ZERO, X, LDX ) + CALL DLACPY( 'U', MINMN, N-1, F, LDF, X, LDX ) + CALL DLACPY( 'A', MINMN, N-1, F(1,2), LDF, Y, LDY ) + IF ( M >= 3 ) THEN + CALL DLASET( 'L', MINMN-2, N-2, ZERO, ZERO, & + Y(3,1), LDY ) + END IF +! +! Compute the DMD of the projected snapshot pairs (X,Y) + CALL DGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN, & + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + REIG, IMEIG, Z, LDZ, RES, B, LDB, V, & + LDV, S, LDS, WORK(MINMN+1), LWORK-MINMN, & + IWORK, LIWORK, INFO1 ) + IF ( INFO1 == 2 .OR. INFO1 == 3 ) THEN + ! Return with error code. See DGEDMD for details. + INFO = INFO1 + RETURN + ELSE + INFO = INFO1 + END IF +! +! The Ritz vectors (Koopman modes) can be explicitly +! formed or returned in factored form. + IF ( WNTVEC ) THEN + ! Compute the eigenvectors explicitly. + IF ( M > MINMN ) CALL DLASET( 'A', M-MINMN, K, ZERO, & + ZERO, Z(MINMN+1,1), LDZ ) + CALL DORMQR( 'L','N', M, K, MINMN, F, LDF, WORK, Z, & + LDZ, WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 ) + ELSE IF ( WNTVCF ) THEN + ! Return the Ritz vectors (eigenvectors) in factored + ! form Z*V, where Z contains orthonormal matrix (the + ! product of Q from the initial QR factorization and + ! the SVD/POD_basis returned by DGEDMD in X) and the + ! second factor (the eigenvectors of the Rayleigh + ! quotient) is in the array V, as returned by DGEDMD. + CALL DLACPY( 'A', N, K, X, LDX, Z, LDZ ) + IF ( M > N ) CALL DLASET( 'A', M-N, K, ZERO, ZERO, & + Z(N+1,1), LDZ ) + CALL DORMQR( 'L','N', M, K, MINMN, F, LDF, WORK, Z, & + LDZ, WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 ) + END IF +! +! Some optional output variables: +! +! The upper triangular factor R in the initial QR +! factorization is optionally returned in the array Y. +! This is useful if this call to DGEDMDQ is to be +! followed by a streaming DMD that is implemented in a +! QR compressed form. + IF ( WNTTRF ) THEN ! Return the upper triangular R in Y + CALL DLASET( 'A', MINMN, N, ZERO, ZERO, Y, LDY ) + CALL DLACPY( 'U', MINMN, N, F, LDF, Y, LDY ) + END IF +! +! The orthonormal/orthogonal factor Q in the initial QR +! factorization is optionally returned in the array F. +! Same as with the triangular factor above, this is +! useful in a streaming DMD. + IF ( WANTQ ) THEN ! Q overwrites F + CALL DORGQR( M, MINMN, MINMN, F, LDF, WORK, & + WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 ) + END IF +! + RETURN +! + END SUBROUTINE DGEDMDQ + \ No newline at end of file diff --git a/lapack-netlib/SRC/dgehrd.f b/lapack-netlib/SRC/dgehrd.f index a40c61cb6..d95bbd182 100644 --- a/lapack-netlib/SRC/dgehrd.f +++ b/lapack-netlib/SRC/dgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gehrd * *> \par Further Details: * ===================== @@ -173,7 +173,7 @@ INTEGER IHI, ILO, INFO, LDA, LWORK, N * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), TAU( * ), WORK( * ) + DOUBLE PRECISION A( LDA, * ), TAU( * ), WORK( * ) * .. * * ===================================================================== @@ -182,7 +182,7 @@ INTEGER NBMAX, LDT, TSIZE PARAMETER ( NBMAX = 64, LDT = NBMAX+1, $ TSIZE = LDT*NBMAX ) - DOUBLE PRECISION ZERO, ONE + DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, $ ONE = 1.0D+0 ) * .. @@ -190,7 +190,7 @@ LOGICAL LQUERY INTEGER I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB, $ NBMIN, NH, NX - DOUBLE PRECISION EI + DOUBLE PRECISION EI * .. * .. External Subroutines .. EXTERNAL DAXPY, DGEHD2, DGEMM, DLAHR2, DLARFB, DTRMM, @@ -221,12 +221,18 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + ENDIF WORK( 1 ) = LWKOPT END IF * @@ -248,7 +254,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -268,7 +273,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of @@ -344,6 +349,7 @@ * Use unblocked code to reduce the rest of the matrix * CALL DGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO ) +* WORK( 1 ) = LWKOPT * RETURN diff --git a/lapack-netlib/SRC/dgejsv.f b/lapack-netlib/SRC/dgejsv.f index 83d16c30e..1db85e9c2 100644 --- a/lapack-netlib/SRC/dgejsv.f +++ b/lapack-netlib/SRC/dgejsv.f @@ -253,7 +253,7 @@ *> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of *> the right singular vectors; *> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), -*> then V is used as workspace if the pprocedure +*> then V is used as workspace if the procedure *> replaces A with A^t. In that case, [U] is computed *> in V as right singular vectors of A^t and then *> copied back to the U array. This 'W' option is just @@ -362,7 +362,7 @@ *> *> \param[out] IWORK *> \verbatim -*> IWORK is INTEGER array, dimension (M+3*N). +*> IWORK is INTEGER array, dimension (MAX(3,M+3*N)). *> On exit, *> IWORK(1) = the numerical rank determined after the initial *> QR factorization with pivoting. See the descriptions @@ -1386,7 +1386,7 @@ IF ( CONDR2 .GE. COND_OK ) THEN * .. save the Householder vectors used for Q3 * (this overwrites the copy of R2, as it will not be -* needed in this branch, but it does not overwritte the +* needed in this branch, but it does not overwrite the * Huseholder vectors of Q2.). CALL DLACPY( 'U', NR, NR, V, LDV, WORK(2*N+1), N ) * .. and the rest of the information on Q3 is in @@ -1409,7 +1409,7 @@ END IF * * Second preconditioning finished; continue with Jacobi SVD -* The input matrix is lower trinagular. +* The input matrix is lower triangular. * * Recover the right singular vectors as solution of a well * conditioned triangular matrix equation. @@ -1454,7 +1454,7 @@ * :) .. the input matrix A is very likely a relative of * the Kahan matrix :) * The matrix R2 is inverted. The solution of the matrix equation -* is Q3^T*V3 = the product of the Jacobi rotations (appplied to +* is Q3^T*V3 = the product of the Jacobi rotations (applied to * the lower triangular L3 from the LQ factorization of * R2=L3*Q3), pre-multiplied with the transposed Q3. CALL DGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U, diff --git a/lapack-netlib/SRC/dgelq.f b/lapack-netlib/SRC/dgelq.f index 013b6c356..255e8732f 100644 --- a/lapack-netlib/SRC/dgelq.f +++ b/lapack-netlib/SRC/dgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -166,6 +166,8 @@ *> the LQ factorization. *> \endverbatim *> +*> \ingroup gelq +*> * ===================================================================== SUBROUTINE DGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) diff --git a/lapack-netlib/SRC/dgelqf.f b/lapack-netlib/SRC/dgelqf.f index ed3372f96..f0eb00a55 100644 --- a/lapack-netlib/SRC/dgelqf.f +++ b/lapack-netlib/SRC/dgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -118,7 +119,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gelqf * *> \par Further Details: * ===================== @@ -174,9 +175,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'DGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -184,19 +184,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = LWKOPT RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN diff --git a/lapack-netlib/SRC/dgelqt3.f b/lapack-netlib/SRC/dgelqt3.f index 5bcc06a80..ee3bdceb4 100644 --- a/lapack-netlib/SRC/dgelqt3.f +++ b/lapack-netlib/SRC/dgelqt3.f @@ -173,7 +173,8 @@ * * Compute Householder transform when M=1 * - CALL DLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T ) + CALL DLARFG( N, A ( 1, 1 ), A( 1, MIN( 2, N ) ), LDA, + & T( 1, 1) ) * ELSE * diff --git a/lapack-netlib/SRC/dgelsd.f b/lapack-netlib/SRC/dgelsd.f index b3b3d8b2d..7dc564f48 100644 --- a/lapack-netlib/SRC/dgelsd.f +++ b/lapack-netlib/SRC/dgelsd.f @@ -59,12 +59,6 @@ *> singular values which are less than RCOND times the largest singular *> value. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -194,7 +188,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEsolve +*> \ingroup gelsd * *> \par Contributors: * ================== @@ -234,7 +228,7 @@ DOUBLE PRECISION ANRM, BIGNUM, BNRM, EPS, SFMIN, SMLNUM * .. * .. External Subroutines .. - EXTERNAL DGEBRD, DGELQF, DGEQRF, DLABAD, DLACPY, DLALSD, + EXTERNAL DGEBRD, DGELQF, DGEQRF, DLACPY, DLALSD, $ DLASCL, DLASET, DORMBR, DORMLQ, DORMQR, XERBLA * .. * .. External Functions .. @@ -282,7 +276,7 @@ $ LOG( TWO ) ) + 1, 0 ) * IF( INFO.EQ.0 ) THEN - MAXWRK = 0 + MAXWRK = 1 LIWORK = 3*MINMN*NLVL + 11*MINMN MM = M IF( M.GE.N .AND. M.GE.MNTHR ) THEN @@ -378,7 +372,6 @@ SFMIN = DLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A if max entry outside range [SMLNUM,BIGNUM]. * diff --git a/lapack-netlib/SRC/dgelss.f b/lapack-netlib/SRC/dgelss.f index c4190f2e0..38449be7f 100644 --- a/lapack-netlib/SRC/dgelss.f +++ b/lapack-netlib/SRC/dgelss.f @@ -164,7 +164,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEsolve +*> \ingroup gelss * * ===================================================================== SUBROUTINE DGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, @@ -203,7 +203,7 @@ * .. * .. External Subroutines .. EXTERNAL DBDSQR, DCOPY, DGEBRD, DGELQF, DGEMM, DGEMV, - $ DGEQRF, DLABAD, DLACPY, DLASCL, DLASET, DORGBR, + $ DGEQRF, DLACPY, DLASCL, DLASET, DORGBR, $ DORMBR, DORMLQ, DORMQR, DRSCL, XERBLA * .. * .. External Functions .. @@ -385,7 +385,6 @@ SFMIN = DLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A if max element outside range [SMLNUM,BIGNUM] * @@ -529,7 +528,7 @@ $ LDB, ZERO, WORK, N ) CALL DLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB ) 20 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL DGEMV( 'T', N, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 ) CALL DCOPY( N, WORK, 1, B, 1 ) END IF @@ -626,7 +625,7 @@ CALL DLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ), $ LDB ) 40 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL DGEMV( 'T', M, M, ONE, WORK( IL ), LDWORK, B( 1, 1 ), $ 1, ZERO, WORK( IWORK ), 1 ) CALL DCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 ) @@ -712,7 +711,7 @@ $ LDB, ZERO, WORK, N ) CALL DLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB ) 60 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL DGEMV( 'T', M, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 ) CALL DCOPY( N, WORK, 1, B, 1 ) END IF diff --git a/lapack-netlib/SRC/dgelsy.f b/lapack-netlib/SRC/dgelsy.f index aebab9264..e9fcd9682 100644 --- a/lapack-netlib/SRC/dgelsy.f +++ b/lapack-netlib/SRC/dgelsy.f @@ -115,6 +115,7 @@ *> B is DOUBLE PRECISION array, dimension (LDB,NRHS) *> On entry, the M-by-NRHS right hand side matrix B. *> On exit, the N-by-NRHS solution matrix X. +*> If M = 0 or N = 0, B is not referenced. *> \endverbatim *> *> \param[in] LDB @@ -147,6 +148,7 @@ *> The effective rank of A, i.e., the order of the submatrix *> R11. This is the same as the order of the submatrix T11 *> in the complete orthogonal factorization of A. +*> If NRHS = 0, RANK = 0 on output. *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/dgemlq.f b/lapack-netlib/SRC/dgemlq.f index 3ba209105..757683f46 100644 --- a/lapack-netlib/SRC/dgemlq.f +++ b/lapack-netlib/SRC/dgemlq.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -160,11 +161,13 @@ *> block sizes MB and NB returned by ILAENV, DGELQ will use either *> DLASWLQ (if the matrix is wide-and-short) or DGELQT to compute *> the LQ factorization. -*> This version of DGEMLQ will use either DLAMSWLQ or DGEMLQT to +*> This version of DGEMLQ will use either DLAMSWLQ or DGEMLQT to *> multiply matrix Q by another matrix. *> Further Details in DLAMSWLQ or DGEMLQT. *> \endverbatim *> +*> \ingroup gemlq +*> * ===================================================================== SUBROUTINE DGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -186,7 +189,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -202,7 +205,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -217,6 +220,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -245,12 +255,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN @@ -262,7 +272,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -275,7 +285,7 @@ $ MB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = LWMIN * RETURN * diff --git a/lapack-netlib/SRC/dgemqr.f b/lapack-netlib/SRC/dgemqr.f index 022cf21e4..608815483 100644 --- a/lapack-netlib/SRC/dgemqr.f +++ b/lapack-netlib/SRC/dgemqr.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -160,12 +161,14 @@ *> block sizes MB and NB returned by ILAENV, DGEQR will use either *> DLATSQR (if the matrix is tall-and-skinny) or DGEQRT to compute *> the QR factorization. -*> This version of DGEMQR will use either DLAMTSQR or DGEMQRT to +*> This version of DGEMQR will use either DLAMTSQR or DGEMQRT to *> multiply matrix Q by another matrix. *> Further Details in DLATMSQR or DGEMQRT. *> *> \endverbatim *> +*> \ingroup gemqr +*> * ===================================================================== SUBROUTINE DGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -187,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -203,7 +206,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -218,6 +221,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -246,12 +256,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN @@ -263,7 +273,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -276,7 +286,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = LWMIN * RETURN * diff --git a/lapack-netlib/SRC/dgeqlf.f b/lapack-netlib/SRC/dgeqlf.f index b8ac0b1a0..a72d9dc76 100644 --- a/lapack-netlib/SRC/dgeqlf.f +++ b/lapack-netlib/SRC/dgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is the *> optimal blocksize. *> @@ -113,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup geqlf * *> \par Further Details: * ===================== @@ -188,8 +189,9 @@ END IF WORK( 1 ) = LWKOPT * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * diff --git a/lapack-netlib/SRC/dgeqp3rk.c b/lapack-netlib/SRC/dgeqp3rk.c new file mode 100644 index 000000000..17a78dd5a --- /dev/null +++ b/lapack-netlib/SRC/dgeqp3rk.c @@ -0,0 +1,1059 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + +/* Check if the matrix A contains +Inf or -Inf, set INFO parameter */ +/* to the column number, where the first +/-Inf is found plus N, */ +/* and continue the computation. */ + + *info = *n + kp1; + + } + +/* ================================================================== */ + +/* Quick return if possible for the case when the first */ +/* stopping criterion is satisfied, i.e. KMAX = 0. */ + + if (*kmax == 0) { + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.; + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + tau[j] = 0.; + } + work[1] = (doublereal) lwkopt; + return 0; + } + +/* ================================================================== */ + + eps = dlamch_("Epsilon"); + +/* Adjust ABSTOL */ + + if (*abstol >= 0.) { + safmin = dlamch_("Safe minimum"); +/* Computing MAX */ + d__1 = *abstol, d__2 = safmin * 2.; + *abstol = f2cmax(d__1,d__2); + } + +/* Adjust RELTOL */ + + if (*reltol >= 0.) { + *reltol = f2cmax(*reltol,eps); + } + +/* =================================================================== */ + +/* JMAX is the maximum index of the column to be factorized, */ +/* which is also limited by the first stopping criterion KMAX. */ + + jmax = f2cmin(*kmax,minmn); + +/* =================================================================== */ + +/* Quick return if possible for the case when the second or third */ +/* stopping criterion for the whole original matrix is satified, */ +/* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */ +/* (which is ONE <= RELTOL). */ + + if (maxc2nrm <= *abstol || 1. <= *reltol) { + + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.; + + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + tau[j] = 0.; + } + + work[1] = (doublereal) lwkopt; + return 0; + } + +/* ================================================================== */ +/* Factorize columns */ +/* ================================================================== */ + +/* Determine the block size. */ + + nbmin = 2; + nx = 0; + + if (nb > 1 && nb < minmn) { + +/* Determine when to cross over from blocked to unblocked code. */ +/* (for N less than NX, unblocked code should be used). */ + +/* Computing MAX */ + i__1 = 0, i__2 = ilaenv_(&c__3, "DGEQP3RK", " ", m, n, &c_n1, &c_n1, ( + ftnlen)8, (ftnlen)1); + nx = f2cmax(i__1,i__2); + + if (nx < minmn) { + +/* Determine if workspace is large enough for blocked code. */ + + if (*lwork < lwkopt) { + +/* Not enough workspace to use optimal block size that */ +/* is currently stored in NB. */ +/* Reduce NB and determine the minimum value of NB. */ + + nb = (*lwork - (*n << 1)) / (*n + 1); +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "DGEQP3RK", " ", m, n, &c_n1, + &c_n1, (ftnlen)8, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + } + } + } + +/* ================================================================== */ + +/* DONE is the boolean flag to rerpresent the case when the */ +/* factorization completed in the block factorization routine, */ +/* before the end of the block. */ + + done = FALSE_; + +/* J is the column index. */ + + j = 1; + +/* (1) Use blocked code initially. */ + +/* JMAXB is the maximum column index of the block, when the */ +/* blocked code is used, is also limited by the first stopping */ +/* criterion KMAX. */ + +/* Computing MIN */ + i__1 = *kmax, i__2 = minmn - nx; + jmaxb = f2cmin(i__1,i__2); + + if (nb >= nbmin && nb < jmax && jmaxb > 0) { + +/* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */ +/* J is the column index of a column block; */ +/* JB is the column block size to pass to block factorization */ +/* routine in a loop step; */ +/* JBF is the number of columns that were actually factorized */ +/* that was returned by the block factorization routine */ +/* in a loop step, JBF <= JB; */ +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + while(j <= jmaxb) { + +/* Computing MIN */ + i__1 = nb, i__2 = jmaxb - j + 1; + jb = f2cmin(i__1,i__2); + n_sub__ = *n - j + 1; + ioffset = j - 1; + +/* Factorize JB columns among the columns A(J:N). */ + + i__1 = *n + *nrhs - j + 1; + dlaqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, + &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, + maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], & + work[*n + j], &work[(*n << 1) + 1], &work[(*n << 1) + jb + + 1], &i__1, &iwork[1], &iinfo); + +/* Set INFO on the first occurence of Inf. */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } + + if (done) { + +/* Either the submatrix is zero before the end of the */ +/* column block, or ABSTOL or RELTOL criterion is */ +/* satisfied before the end of the column block, we can */ +/* return from the routine. Perform the following before */ +/* returning: */ +/* a) Set the number of factorized columns K, */ +/* K = IOFFSET + JBF from the last call of blocked */ +/* routine. */ +/* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */ +/* by the block factorization routine; */ +/* 2) The remaining TAUs are set to ZERO by the */ +/* block factorization routine. */ + + *k = ioffset + jbf; + +/* Set INFO on the first occurrence of NaN, NaN takes */ +/* prcedence over Inf. */ + + if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + +/* Return from the routine. */ + + work[1] = (doublereal) lwkopt; + + return 0; + + } + + j += jbf; + + } + + } + +/* Use unblocked code to factor the last or only block. */ +/* J = JMAX+1 means we factorized the maximum possible number of */ +/* columns, that is in ELSE clause we need to compute */ +/* the MAXC2NORM and RELMAXC2NORM to return after we processed */ +/* the blocks. */ + + if (j <= jmax) { + +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + n_sub__ = *n - j + 1; + ioffset = j - 1; + + i__1 = jmax - j + 1; + dlaqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, & + maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, + relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], &work[*n + j], & + work[(*n << 1) + 1], &iinfo); + +/* ABSTOL or RELTOL criterion is satisfied when the number of */ +/* the factorized columns KF is smaller then the number */ +/* of columns JMAX-J+1 supplied to be factorized by the */ +/* unblocked routine, we can return from */ +/* the routine. Perform the following before returning: */ +/* a) Set the number of factorized columns K, */ +/* b) MAXC2NRMK and RELMAXC2NRMK are returned by the */ +/* unblocked factorization routine above. */ + + *k = j - 1 + kf; + +/* Set INFO on the first exception occurence. */ + +/* Set INFO on the first exception occurence of Inf or NaN, */ +/* (NaN takes precedence over Inf). */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } else if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + + } else { + +/* Compute the return values for blocked code. */ + +/* Set the number of factorized columns if the unblocked routine */ +/* was not called. */ + + *k = jmax; + +/* If there exits a residual matrix after the blocked code: */ +/* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */ +/* residual matrix, otherwise set them to ZERO; */ +/* 2) Set TAU(K+1:MINMN) to ZERO. */ + + if (*k < minmn) { + i__1 = *n - *k; + jmaxc2nrm = *k + idamax_(&i__1, &work[*k + 1], &c__1); + *maxc2nrmk = work[jmaxc2nrm]; + if (*k == 0) { + *relmaxc2nrmk = 1.; + } else { + *relmaxc2nrmk = *maxc2nrmk / maxc2nrm; + } + + i__1 = minmn; + for (j = *k + 1; j <= i__1; ++j) { + tau[j] = 0.; + } + + } + +/* END IF( J.LE.JMAX ) THEN */ + + } + + work[1] = (doublereal) lwkopt; + + return 0; + +/* End of DGEQP3RK */ + +} /* dgeqp3rk_ */ + diff --git a/lapack-netlib/SRC/dgeqp3rk.f b/lapack-netlib/SRC/dgeqp3rk.f new file mode 100644 index 000000000..b8e41b39c --- /dev/null +++ b/lapack-netlib/SRC/dgeqp3rk.f @@ -0,0 +1,1082 @@ +*> \brief \b DGEQP3RK computes a truncated Householder QR factorization with column pivoting of a real m-by-n matrix A by using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DGEQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, +* $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ WORK, LWORK, IWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, K, KMAX, LDA, LWORK, M, N, NRHS +* DOUBLE PRECISION ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* DOUBLE PRECISION A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DGEQP3RK performs two tasks simultaneously: +*> +*> Task 1: The routine computes a truncated (rank K) or full rank +*> Householder QR factorization with column pivoting of a real +*> M-by-N matrix A using Level 3 BLAS. K is the number of columns +*> that were factorized, i.e. factorization rank of the +*> factor R, K <= min(M,N). +*> +*> A * P(K) = Q(K) * R(K) = +*> +*> = Q(K) * ( R11(K) R12(K) ) = Q(K) * ( R(K)_approx ) +*> ( 0 R22(K) ) ( 0 R(K)_residual ), +*> +*> where: +*> +*> P(K) is an N-by-N permutation matrix; +*> Q(K) is an M-by-M orthogonal matrix; +*> R(K)_approx = ( R11(K), R12(K) ) is a rank K approximation of the +*> full rank factor R with K-by-K upper-triangular +*> R11(K) and K-by-N rectangular R12(K). The diagonal +*> entries of R11(K) appear in non-increasing order +*> of absolute value, and absolute values of all of +*> them exceed the maximum column 2-norm of R22(K) +*> up to roundoff error. +*> R(K)_residual = R22(K) is the residual of a rank K approximation +*> of the full rank factor R. It is a +*> an (M-K)-by-(N-K) rectangular matrix; +*> 0 is a an (M-K)-by-K zero matrix. +*> +*> Task 2: At the same time, the routine overwrites a real M-by-NRHS +*> matrix B with Q(K)**T * B using Level 3 BLAS. +*> +*> ===================================================================== +*> +*> The matrices A and B are stored on input in the array A as +*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS) +*> respectively. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> The truncation criteria (i.e. when to stop the factorization) +*> can be any of the following: +*> +*> 1) The input parameter KMAX, the maximum number of columns +*> KMAX to factorize, i.e. the factorization rank is limited +*> to KMAX. If KMAX >= min(M,N), the criterion is not used. +*> +*> 2) The input parameter ABSTOL, the absolute tolerance for +*> the maximum column 2-norm of the residual matrix R22(K). This +*> means that the factorization stops if this norm is less or +*> equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used. +*> +*> 3) The input parameter RELTOL, the tolerance for the maximum +*> column 2-norm matrix of the residual matrix R22(K) divided +*> by the maximum column 2-norm of the original matrix A, which +*> is equal to abs(R(1,1)). This means that the factorization stops +*> when the ratio of the maximum column 2-norm of R22(K) to +*> the maximum column 2-norm of A is less than or equal to RELTOL. +*> If RELTOL < 0.0, the criterion is not used. +*> +*> 4) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix R22(K) is a zero matrix in some +*> factorization step K. ( This stopping criterion is implicit. ) +*> +*> The algorithm stops when any of these conditions is first +*> satisfied, otherwise the whole matrix A is factorized. +*> +*> To factorize the whole matrix A, use the values +*> KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0. +*> +*> The routine returns: +*> a) Q(K), R(K)_approx = ( R11(K), R12(K) ), +*> R(K)_residual = R22(K), P(K), i.e. the resulting matrices +*> of the factorization; P(K) is represented by JPIV, +*> ( if K = min(M,N), R(K)_approx is the full factor R, +*> and there is no residual matrix R(K)_residual); +*> b) K, the number of columns that were factorized, +*> i.e. factorization rank; +*> c) MAXC2NRMK, the maximum column 2-norm of the residual +*> matrix R(K)_residual = R22(K), +*> ( if K = min(M,N), MAXC2NRMK = 0.0 ); +*> d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum +*> column 2-norm of the original matrix A, which is equal +*> to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 ); +*> e) Q(K)**T * B, the matrix B with the orthogonal +*> transformation Q(K)**T applied on the left. +*> +*> The N-by-N permutation matrix P(K) is stored in a compact form in +*> the integer array JPIV. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The M-by-M orthogonal matrix Q is represented as a product +*> of elementary Householder reflectors +*> +*> Q(K) = H(1) * H(2) * . . . * H(K), +*> +*> where K is the number of columns that were factorized. +*> +*> Each H(j) has the form +*> +*> H(j) = I - tau * v * v**T, +*> +*> where 1 <= j <= K and +*> I is an M-by-M identity matrix, +*> tau is a real scalar, +*> v is a real vector with v(1:j-1) = 0 and v(j) = 1. +*> +*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j). +*> +*> See the Further Details section for more information. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e. the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M,N), then this stopping criterion +*> is not used, the routine factorizes columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B are not modified, and +*> the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION +*> +*> The second factorization stopping criterion, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix R22(K). +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix R22(K) +*> is less than or equal to ABSTOL. Let SAFMIN = DLAMCH('S'). +*> +*> a) If ABSTOL is NaN, then no computation is performed +*> and an error message ( INFO = -5 ) is issued +*> by XERBLA. +*> +*> b) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN +*> is used. This includes the case ABSTOL = -0.0. +*> +*> d) If 2*SAFMIN <= ABSTOL then the input value +*> of ABSTOL is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If ABSTOL chosen above is >= MAXC2NRM, then this +*> stopping criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. The routine +*> returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case ABSTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION +*> +*> The third factorization stopping criterion, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio +*> abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of +*> the residual matrix R22(K) to the maximum column 2-norm of +*> the original matrix A. The algorithm converges (stops the +*> factorization), when abs(R(K+1,K+1))/abs(R(1,1)) A is less +*> than or equal to RELTOL. Let EPS = DLAMCH('E'). +*> +*> a) If RELTOL is NaN, then no computation is performed +*> and an error message ( INFO = -6 ) is issued +*> by XERBLA. +*> +*> b) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used. +*> This includes the case RELTOL = -0.0. +*> +*> d) If EPS <= RELTOL then the input value of RELTOL +*> is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If RELTOL chosen above is >= 1.0, then this stopping +*> criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. +*> The routine returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case RELTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> +*> NOTE: We recommend that RELTOL satisfy +*> min( max(M,N)*EPS, sqrt(EPS) ) <= RELTOL +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N+NRHS) +*> +*> On entry: +*> +*> a) The subarray A(1:M,1:N) contains the M-by-N matrix A. +*> b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS +*> matrix B. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> +*> a) The subarray A(1:M,1:N) contains parts of the factors +*> of the matrix A: +*> +*> 1) If K = 0, A(1:M,1:N) contains the original matrix A. +*> 2) If K > 0, A(1:M,1:N) contains parts of the +*> factors: +*> +*> 1. The elements below the diagonal of the subarray +*> A(1:M,1:K) together with TAU(1:K) represent the +*> orthogonal matrix Q(K) as a product of K Householder +*> elementary reflectors. +*> +*> 2. The elements on and above the diagonal of +*> the subarray A(1:K,1:N) contain K-by-N +*> upper-trapezoidal matrix +*> R(K)_approx = ( R11(K), R12(K) ). +*> NOTE: If K=min(M,N), i.e. full rank factorization, +*> then R_approx(K) is the full factor R which +*> is upper-trapezoidal. If, in addition, M>=N, +*> then R is upper-triangular. +*> +*> 3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K) +*> rectangular matrix R(K)_residual = R22(K). +*> +*> b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains +*> the M-by-NRHS product Q(K)**T * B. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> This is the leading dimension for both matrices, A and B. +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> +*> NOTE: If K = 0, a) the arrays A and B are not modified; +*> b) the array TAU(1:min(M,N)) is set to ZERO, +*> if the matrix A does not contain NaN, +*> otherwise the elements TAU(1:min(M,N)) +*> are undefined; +*> c) the elements of the array JPIV are set +*> as follows: for j = 1:N, JPIV(j) = j. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix R22(K), +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then MAXC2NRMK equals the maximum column 2-norm +*> of the original matrix A. +*> +*> b) If 0 < K < min(M,N), then MAXC2NRMK is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then MAXC2NRMK = 0.0. +*> +*> NOTE: MAXC2NRMK in the factorization step K would equal +*> R(K+1,K+1) in the next factorization step K+1. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix R22(K) (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then RELMAXC2NRMK = 1.0. +*> +*> b) If 0 < K < min(M,N), then +*> RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then RELMAXC2NRMK = 0.0. +*> +*> NOTE: RELMAXC2NRMK in the factorization step K would equal +*> abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization +*> step K+1. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The elements of the array JPIV(1:N) are always set +*> by the routine, for example, even when no columns +*> were factorized, i.e. when K = 0, the elements are +*> set as JPIV(j) = j for j = 1:N. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is DOUBLE PRECISION array, dimension (min(M,N)) +*> The scalar factors of the elementary reflectors. +*> +*> If 0 < K <= min(M,N), only the elements TAU(1:K) of +*> the array TAU are modified by the factorization. +*> After the factorization computed, if no NaN was found +*> during the factorization, the remaining elements +*> TAU(K+1:min(M,N)) are set to zero, otherwise the +*> elements TAU(K+1:min(M,N)) are not set and therefore +*> undefined. +*> ( If K = 0, all elements of TAU are set to zero, if +*> the matrix A does not contain NaN. ) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= (3*N+NRHS-1), otherwise. +*> For optimal performance LWORK >= (2*N + NB*( N+NRHS+1 )), +*> where NB is the optimal block size for DGEQP3RK returned +*> by ILAENV. Minimal block size MINNB=2. +*> +*> NOTE: The decision, whether to use unblocked BLAS 2 +*> or blocked BLAS 3 code is based not only on the dimension +*> LWORK of the availbale workspace WORK, but also also on the +*> matrix A dimension N via crossover point NX returned +*> by ILAENV. (For N less than NX, unblocked code should be +*> used.) +*> +*> If LWORK = -1, then a workspace query is assumed; +*> the routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix in the blocked step auxiliary subroutine DLAQP3RK ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) INFO < 0: if INFO = -i, the i-th argument had an +*> illegal value. +*> 3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup geqp3rk +* +*> \par Further Details: +* ===================== +* +*> \verbatim +*> DGEQP3RK is based on the same BLAS3 Householder QR factorization +*> algorithm with column pivoting as in DGEQP3 routine which uses +*> DLARFG routine to generate Householder reflectors +*> for QR factorization. +*> +*> We can also write: +*> +*> A = A_approx(K) + A_residual(K) +*> +*> The low rank approximation matrix A(K)_approx from +*> the truncated QR factorization of rank K of the matrix A is: +*> +*> A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T +*> ( 0 0 ) +*> +*> = Q(K) * ( R11(K) R12(K) ) * P(K)**T +*> ( 0 0 ) +*> +*> The residual A_residual(K) of the matrix A is: +*> +*> A_residual(K) = Q(K) * ( 0 0 ) * P(K)**T = +*> ( 0 R(K)_residual ) +*> +*> = Q(K) * ( 0 0 ) * P(K)**T +*> ( 0 R22(K) ) +*> +*> The truncated (rank K) factorization guarantees that +*> the maximum column 2-norm of A_residual(K) is less than +*> or equal to MAXC2NRMK up to roundoff error. +*> +*> NOTE: An approximation of the null vectors +*> of A can be easily computed from R11(K) +*> and R12(K): +*> +*> Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) ) +*> ( -I ) +*> +*> \endverbatim +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, + $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ WORK, LWORK, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, K, KF, KMAX, LDA, LWORK, M, N, NRHS + DOUBLE PRECISION ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + DOUBLE PRECISION A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER INB, INBMIN, IXOVER + PARAMETER ( INB = 1, INBMIN = 2, IXOVER = 3 ) + DOUBLE PRECISION ZERO, ONE, TWO + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, DONE + INTEGER IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX, + $ JMAXC2NRM, KP1, LWKOPT, MINMN, N_SUB, NB, + $ NBMIN, NX + DOUBLE PRECISION EPS, HUGEVAL, MAXC2NRM, SAFMIN +* .. +* .. External Subroutines .. + EXTERNAL DLAQP2RK, DLAQP3RK, XERBLA +* .. +* .. External Functions .. + LOGICAL DISNAN + INTEGER IDAMAX, ILAENV + DOUBLE PRECISION DLAMCH, DNRM2 + EXTERNAL DISNAN, DLAMCH, DNRM2, IDAMAX, ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test input arguments +* ==================== +* + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( KMAX.LT.0 ) THEN + INFO = -4 + ELSE IF( DISNAN( ABSTOL ) ) THEN + INFO = -5 + ELSE IF( DISNAN( RELTOL ) ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -8 + END IF +* +* If the input parameters M, N, NRHS, KMAX, LDA are valid: +* a) Test the input workspace size LWORK for the minimum +* size requirement IWS. +* b) Determine the optimal block size NB and optimal +* workspace size LWKOPT to be returned in WORK(1) +* in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., +* (3) when routine exits. +* Here, IWS is the miminum workspace required for unblocked +* code. +* + IF( INFO.EQ.0 ) THEN + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + IWS = 1 + LWKOPT = 1 + ELSE +* +* Minimal workspace size in case of using only unblocked +* BLAS 2 code in DLAQP2RK. +* 1) DGEQP3RK and DLAQP2RK: 2*N to store full and partial +* column 2-norms. +* 2) DLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in DLARF subroutine inside DLAQP2RK to apply an +* elementary reflector from the left. +* TOTAL_WORK_SIZE = 3*N + NRHS - 1 +* + IWS = 3*N + NRHS - 1 +* +* Assign to NB optimal block size. +* + NB = ILAENV( INB, 'DGEQP3RK', ' ', M, N, -1, -1 ) +* +* A formula for the optimal workspace size in case of using +* both unblocked BLAS 2 in DLAQP2RK and blocked BLAS 3 code +* in DLAQP3RK. +* 1) DGEQP3RK, DLAQP2RK, DLAQP3RK: 2*N to store full and +* partial column 2-norms. +* 2) DLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in DLARF subroutine to apply an elementary reflector +* from the left. +* 3) DLAQP3RK: NB*(N+NRHS) to use in the work array F that +* is used to apply a block reflector from +* the left. +* 4) DLAQP3RK: NB to use in the auxilixary array AUX. +* Sizes (2) and ((3) + (4)) should intersect, therefore +* TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2. +* + LWKOPT = 2*N + NB*( N+NRHS+1 ) + END IF + WORK( 1 ) = DBLE( LWKOPT ) +* + IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN + INFO = -15 + END IF + END IF +* +* NOTE: The optimal workspace size is returned in WORK(1), if +* the input parameters M, N, NRHS, KMAX, LDA are valid. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGEQP3RK', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible for M=0 or N=0. +* + IF( MINMN.EQ.0 ) THEN + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + WORK( 1 ) = DBLE( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* +* Initialize column pivot array JPIV. +* + DO J = 1, N + JPIV( J ) = J + END DO +* +* ================================================================== +* +* Initialize storage for partial and exact column 2-norms. +* a) The elements WORK(1:N) are used to store partial column +* 2-norms of the matrix A, and may decrease in each computation +* step; initialize to the values of complete columns 2-norms. +* b) The elements WORK(N+1:2*N) are used to store complete column +* 2-norms of the matrix A, they are not changed during the +* computation; initialize the values of complete columns 2-norms. +* + DO J = 1, N + WORK( J ) = DNRM2( M, A( 1, J ), 1 ) + WORK( N+J ) = WORK( J ) + END DO +* +* ================================================================== +* +* Compute the pivot column index and the maximum column 2-norm +* for the whole original matrix stored in A(1:M,1:N). +* + KP1 = IDAMAX( N, WORK( 1 ), 1 ) + MAXC2NRM = WORK( KP1 ) +* +* ==================================================================. +* + IF( DISNAN( MAXC2NRM ) ) THEN +* +* Check if the matrix A contains NaN, set INFO parameter +* to the column number where the first NaN is found and return +* from the routine. +* + K = 0 + INFO = KP1 +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = MAXC2NRM +* +* Array TAU is not set and contains undefined elements. +* + WORK( 1 ) = DBLE( LWKOPT ) + RETURN + END IF +* +* =================================================================== +* + IF( MAXC2NRM.EQ.ZERO ) THEN +* +* Check is the matrix A is a zero matrix, set array TAU and +* return from the routine. +* + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO +* + DO J = 1, MINMN + TAU( J ) = ZERO + END DO +* + WORK( 1 ) = DBLE( LWKOPT ) + RETURN +* + END IF +* +* =================================================================== +* + HUGEVAL = DLAMCH( 'Overflow' ) +* + IF( MAXC2NRM.GT.HUGEVAL ) THEN +* +* Check if the matrix A contains +Inf or -Inf, set INFO parameter +* to the column number, where the first +/-Inf is found plus N, +* and continue the computation. +* + INFO = N + KP1 +* + END IF +* +* ================================================================== +* +* Quick return if possible for the case when the first +* stopping criterion is satisfied, i.e. KMAX = 0. +* + IF( KMAX.EQ.0 ) THEN + K = 0 + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = ONE + DO J = 1, MINMN + TAU( J ) = ZERO + END DO + WORK( 1 ) = DBLE( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* + EPS = DLAMCH('Epsilon') +* +* Adjust ABSTOL +* + IF( ABSTOL.GE.ZERO ) THEN + SAFMIN = DLAMCH('Safe minimum') + ABSTOL = MAX( ABSTOL, TWO*SAFMIN ) + END IF +* +* Adjust RELTOL +* + IF( RELTOL.GE.ZERO ) THEN + RELTOL = MAX( RELTOL, EPS ) + END IF +* +* =================================================================== +* +* JMAX is the maximum index of the column to be factorized, +* which is also limited by the first stopping criterion KMAX. +* + JMAX = MIN( KMAX, MINMN ) +* +* =================================================================== +* +* Quick return if possible for the case when the second or third +* stopping criterion for the whole original matrix is satified, +* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL +* (which is ONE <= RELTOL). +* + IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN +* + K = 0 + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = ONE +* + DO J = 1, MINMN + TAU( J ) = ZERO + END DO +* + WORK( 1 ) = DBLE( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* Factorize columns +* ================================================================== +* +* Determine the block size. +* + NBMIN = 2 + NX = 0 +* + IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN +* +* Determine when to cross over from blocked to unblocked code. +* (for N less than NX, unblocked code should be used). +* + NX = MAX( 0, ILAENV( IXOVER, 'DGEQP3RK', ' ', M, N, -1, -1 )) +* + IF( NX.LT.MINMN ) THEN +* +* Determine if workspace is large enough for blocked code. +* + IF( LWORK.LT.LWKOPT ) THEN +* +* Not enough workspace to use optimal block size that +* is currently stored in NB. +* Reduce NB and determine the minimum value of NB. +* + NB = ( LWORK-2*N ) / ( N+1 ) + NBMIN = MAX( 2, ILAENV( INBMIN, 'DGEQP3RK', ' ', M, N, + $ -1, -1 ) ) +* + END IF + END IF + END IF +* +* ================================================================== +* +* DONE is the boolean flag to rerpresent the case when the +* factorization completed in the block factorization routine, +* before the end of the block. +* + DONE = .FALSE. +* +* J is the column index. +* + J = 1 +* +* (1) Use blocked code initially. +* +* JMAXB is the maximum column index of the block, when the +* blocked code is used, is also limited by the first stopping +* criterion KMAX. +* + JMAXB = MIN( KMAX, MINMN - NX ) +* + IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN +* +* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: +* J is the column index of a column block; +* JB is the column block size to pass to block factorization +* routine in a loop step; +* JBF is the number of columns that were actually factorized +* that was returned by the block factorization routine +* in a loop step, JBF <= JB; +* N_SUB is the number of columns in the submatrix; +* IOFFSET is the number of rows that should not be factorized. +* + DO WHILE( J.LE.JMAXB ) +* + JB = MIN( NB, JMAXB-J+1 ) + N_SUB = N-J+1 + IOFFSET = J-1 +* +* Factorize JB columns among the columns A(J:N). +* + CALL DLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA, + $ DONE, JBF, MAXC2NRMK, RELMAXC2NRMK, + $ JPIV( J ), TAU( J ), + $ WORK( J ), WORK( N+J ), + $ WORK( 2*N+1 ), WORK( 2*N+JB+1 ), + $ N+NRHS-J+1, IWORK, IINFO ) +* +* Set INFO on the first occurence of Inf. +* + IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN + INFO = 2*IOFFSET + IINFO + END IF +* + IF( DONE ) THEN +* +* Either the submatrix is zero before the end of the +* column block, or ABSTOL or RELTOL criterion is +* satisfied before the end of the column block, we can +* return from the routine. Perform the following before +* returning: +* a) Set the number of factorized columns K, +* K = IOFFSET + JBF from the last call of blocked +* routine. +* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned +* by the block factorization routine; +* 2) The remaining TAUs are set to ZERO by the +* block factorization routine. +* + K = IOFFSET + JBF +* +* Set INFO on the first occurrence of NaN, NaN takes +* prcedence over Inf. +* + IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN + INFO = IOFFSET + IINFO + END IF +* +* Return from the routine. +* + WORK( 1 ) = DBLE( LWKOPT ) +* + RETURN +* + END IF +* + J = J + JBF +* + END DO +* + END IF +* +* Use unblocked code to factor the last or only block. +* J = JMAX+1 means we factorized the maximum possible number of +* columns, that is in ELSE clause we need to compute +* the MAXC2NORM and RELMAXC2NORM to return after we processed +* the blocks. +* + IF( J.LE.JMAX ) THEN +* +* N_SUB is the number of columns in the submatrix; +* IOFFSET is the number of rows that should not be factorized. +* + N_SUB = N-J+1 + IOFFSET = J-1 +* + CALL DLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1, + $ ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA, + $ KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ), + $ TAU( J ), WORK( J ), WORK( N+J ), + $ WORK( 2*N+1 ), IINFO ) +* +* ABSTOL or RELTOL criterion is satisfied when the number of +* the factorized columns KF is smaller then the number +* of columns JMAX-J+1 supplied to be factorized by the +* unblocked routine, we can return from +* the routine. Perform the following before returning: +* a) Set the number of factorized columns K, +* b) MAXC2NRMK and RELMAXC2NRMK are returned by the +* unblocked factorization routine above. +* + K = J - 1 + KF +* +* Set INFO on the first exception occurence. +* +* Set INFO on the first exception occurence of Inf or NaN, +* (NaN takes precedence over Inf). +* + IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN + INFO = 2*IOFFSET + IINFO + ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN + INFO = IOFFSET + IINFO + END IF +* + ELSE +* +* Compute the return values for blocked code. +* +* Set the number of factorized columns if the unblocked routine +* was not called. +* + K = JMAX +* +* If there exits a residual matrix after the blocked code: +* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the +* residual matrix, otherwise set them to ZERO; +* 2) Set TAU(K+1:MINMN) to ZERO. +* + IF( K.LT.MINMN ) THEN + JMAXC2NRM = K + IDAMAX( N-K, WORK( K+1 ), 1 ) + MAXC2NRMK = WORK( JMAXC2NRM ) + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + DO J = K + 1, MINMN + TAU( J ) = ZERO + END DO +* + END IF +* +* END IF( J.LE.JMAX ) THEN +* + END IF +* + WORK( 1 ) = DBLE( LWKOPT ) +* + RETURN +* +* End of DGEQP3RK +* + END diff --git a/lapack-netlib/SRC/dgeqr.f b/lapack-netlib/SRC/dgeqr.f index eac8930ce..6ed8f211f 100644 --- a/lapack-netlib/SRC/dgeqr.f +++ b/lapack-netlib/SRC/dgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE DGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME @@ -244,8 +246,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +257,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +272,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +286,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +313,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ * RETURN * diff --git a/lapack-netlib/SRC/dgeqrfp.f b/lapack-netlib/SRC/dgeqrfp.f index 46d2ee479..aa757e96c 100644 --- a/lapack-netlib/SRC/dgeqrfp.f +++ b/lapack-netlib/SRC/dgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -122,7 +123,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup geqrfp * *> \par Further Details: * ===================== @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. EXTERNAL DGEQR2P, DLARFB, DLARFT, XERBLA @@ -181,8 +182,16 @@ * INFO = 0 NB = ILAENV( 1, 'DGEQRF', ' ', M, N, -1, -1 ) - LWKOPT = N*NB + K = MIN( M, N ) + IF( K.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = N + LWKOPT = N*NB + END IF WORK( 1 ) = LWKOPT +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -190,7 +199,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN @@ -202,7 +211,6 @@ * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -210,7 +218,7 @@ * NBMIN = 2 NX = 0 - IWS = N + IWS = LWKMIN IF( NB.GT.1 .AND. NB.LT.K ) THEN * * Determine when to cross over from blocked to unblocked code. diff --git a/lapack-netlib/SRC/dgerqf.f b/lapack-netlib/SRC/dgerqf.f index cca9d6367..435239cc7 100644 --- a/lapack-netlib/SRC/dgerqf.f +++ b/lapack-netlib/SRC/dgerqf.f @@ -114,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gerqf * *> \par Further Details: * ===================== @@ -189,7 +189,7 @@ END IF WORK( 1 ) = LWKOPT * - IF ( .NOT.LQUERY ) THEN + IF( .NOT.LQUERY ) THEN IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) $ INFO = -7 END IF diff --git a/lapack-netlib/SRC/dgesdd.f b/lapack-netlib/SRC/dgesdd.f index 738a122e8..87a4e702d 100644 --- a/lapack-netlib/SRC/dgesdd.f +++ b/lapack-netlib/SRC/dgesdd.f @@ -55,12 +55,6 @@ *> *> Note that the routine returns VT = V**T, not V. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dgesvdq.f b/lapack-netlib/SRC/dgesvdq.f index 6f9ac703e..a514def8b 100644 --- a/lapack-netlib/SRC/dgesvdq.f +++ b/lapack-netlib/SRC/dgesvdq.f @@ -365,7 +365,7 @@ *> an optimal implementation would do all necessary scaling before calling *> CGESVD and the scaling in CGESVD can be switched off. *> 3. Other comments related to code optimization are given in comments in the -*> code, enlosed in [[double brackets]]. +*> code, enclosed in [[double brackets]]. *> \endverbatim * *> \par Bugs, examples and comments diff --git a/lapack-netlib/SRC/dgesvj.f b/lapack-netlib/SRC/dgesvj.f index 5fdb21e45..198bfb0a5 100644 --- a/lapack-netlib/SRC/dgesvj.f +++ b/lapack-netlib/SRC/dgesvj.f @@ -208,7 +208,7 @@ *> *> \param[in,out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On entry : *> If JOBU = 'C' : *> WORK(1) = CTOL, where CTOL defines the threshold for convergence. @@ -239,7 +239,12 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> length of WORK, WORK >= MAX(6,M+N) +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(6,M+N), otherwise. +*> +*> If on entry LWORK = -1, then a workspace query is assumed and +*> no computation is done; WORK(1) is set to the minial (and optimal) +*> length of WORK. *> \endverbatim *> *> \param[out] INFO @@ -260,7 +265,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gesvj * *> \par Further Details: * ===================== @@ -365,9 +370,9 @@ INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, - $ SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + $ SWBAND, MINMN, LWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, + $ ROTOK, RSVEC, UCTOL, UPPER * .. * .. Local Arrays .. DOUBLE PRECISION FASTR( 5 ) @@ -408,6 +413,14 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 6, M+N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -427,7 +440,7 @@ INFO = -11 ELSE IF( UCTOL .AND. ( WORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( LWORK.LT.MAX( M+N, 6 ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 ELSE INFO = 0 @@ -437,11 +450,14 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGESVJ', -INFO ) RETURN + ELSE IF( LQUERY ) THEN + WORK( 1 ) = LWMIN + RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/dgetri.f b/lapack-netlib/SRC/dgetri.f index 92ef90c18..7b5a3a1b6 100644 --- a/lapack-netlib/SRC/dgetri.f +++ b/lapack-netlib/SRC/dgetri.f @@ -107,7 +107,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup getri * * ===================================================================== SUBROUTINE DGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -151,8 +151,9 @@ * INFO = 0 NB = ILAENV( 1, 'DGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT +* LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/dgetsls.f b/lapack-netlib/SRC/dgetsls.f index 25f4c12c2..73b505ff7 100644 --- a/lapack-netlib/SRC/dgetsls.f +++ b/lapack-netlib/SRC/dgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEsolve +*> \ingroup getsls * * ===================================================================== SUBROUTINE DGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB, @@ -189,7 +189,7 @@ * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH, DLANGE - EXTERNAL LSAME, DLABAD, DLAMCH, DLANGE + EXTERNAL LSAME, DLAMCH, DLANGE * .. * .. External Subroutines .. EXTERNAL DGEQR, DGEMQR, DLASCL, DLASET, @@ -226,7 +226,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEM = 1 + WSIZEO = 1 + ELSE IF( M.GE.N ) THEN CALL DGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) @@ -294,7 +297,6 @@ * SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * diff --git a/lapack-netlib/SRC/dgetsqrhrt.f b/lapack-netlib/SRC/dgetsqrhrt.f index 668deeba8..682c7c30f 100644 --- a/lapack-netlib/SRC/dgetsqrhrt.f +++ b/lapack-netlib/SRC/dgetsqrhrt.f @@ -130,14 +130,17 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -160,7 +163,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup getsqrhrt * *> \par Contributors: * ================== @@ -212,7 +215,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -225,7 +228,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -263,8 +266,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN INFO = -11 END IF * @@ -346,4 +350,4 @@ * * End of DGETSQRHRT * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/dgges.f b/lapack-netlib/SRC/dgges.f index 31db23715..b9ffc7982 100644 --- a/lapack-netlib/SRC/dgges.f +++ b/lapack-netlib/SRC/dgges.f @@ -234,8 +234,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If N = 0, LWORK >= 1, else LWORK >= 8*N+16. -*> For good performance , LWORK must generally be larger. +*> If N = 0, LWORK >= 1, else LWORK >= MAX(8*N,6*N+16). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -275,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEeigen +*> \ingroup gges * * ===================================================================== SUBROUTINE DGGES( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, LDB, @@ -321,9 +321,8 @@ DOUBLE PRECISION DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHRD, DHGEQZ, DLABAD, - $ DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, - $ XERBLA + EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHRD, DHGEQZ, DLACPY, + $ DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, XERBLA * .. * .. External Functions .. LOGICAL LSAME @@ -431,7 +430,6 @@ EPS = DLAMCH( 'P' ) SAFMIN = DLAMCH( 'S' ) SAFMAX = ONE / SAFMIN - CALL DLABAD( SAFMIN, SAFMAX ) SMLNUM = SQRT( SAFMIN ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/dgges3.f b/lapack-netlib/SRC/dgges3.f index 7b00d294a..2ef55951a 100644 --- a/lapack-netlib/SRC/dgges3.f +++ b/lapack-netlib/SRC/dgges3.f @@ -234,6 +234,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK >= 6*N+16. +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -273,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEeigen +*> \ingroup gges3 * * ===================================================================== SUBROUTINE DGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, @@ -309,7 +311,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, LST2SL, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT + $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT, + $ LWKMIN DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SAFMAX, SAFMIN, SMLNUM * .. @@ -318,9 +321,8 @@ DOUBLE PRECISION DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLABAD, - $ DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, - $ XERBLA + EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLACPY, + $ DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, XERBLA * .. * .. External Functions .. LOGICAL LSAME @@ -362,6 +364,12 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + IF( N.EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 6*N+16 + END IF +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -378,7 +386,7 @@ INFO = -15 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -17 - ELSE IF( LWORK.LT.6*N+16 .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -19 END IF * @@ -386,29 +394,33 @@ * IF( INFO.EQ.0 ) THEN CALL DGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 6*N+16, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL DORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) IF( ILVSL ) THEN CALL DORGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) END IF CALL DGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL, $ LDVSL, VSR, LDVSR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL DLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VSL, LDVSL, VSR, LDVSR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) IF( WANTST ) THEN CALL DTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VSL, LDVSL, VSR, LDVSR, $ SDIM, PVSL, PVSR, DIF, WORK, -1, IDUM, 1, $ IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = LWKOPT END IF - WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN @@ -430,7 +442,6 @@ EPS = DLAMCH( 'P' ) SAFMIN = DLAMCH( 'S' ) SAFMAX = ONE / SAFMIN - CALL DLABAD( SAFMIN, SAFMAX ) SMLNUM = SQRT( SAFMIN ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/dggev3.f b/lapack-netlib/SRC/dggev3.f index 4bbe8a40f..b970c04c4 100644 --- a/lapack-netlib/SRC/dggev3.f +++ b/lapack-netlib/SRC/dggev3.f @@ -188,7 +188,9 @@ *> *> \param[in] LWORK *> \verbatim -*> LWORK is INTEGER +*> LWORK is INTEGER. +*> The dimension of the array WORK. LWORK >= MAX(1,8*N). +*> For good performance, LWORK should generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -217,7 +219,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEeigen +*> \ingroup ggev3 * * ===================================================================== SUBROUTINE DGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHAR, @@ -248,7 +250,8 @@ LOGICAL ILASCL, ILBSCL, ILV, ILVL, ILVR, LQUERY CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, - $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT + $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT, + $ LWKMIN DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP * .. @@ -256,9 +259,8 @@ LOGICAL LDUMMA( 1 ) * .. * .. External Subroutines .. - EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLABAD, - $ DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGEVC, - $ XERBLA + EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLACPY, + $ DLASCL, DLASET, DORGQR, DORMQR, DTGEVC, XERBLA * .. * .. External Functions .. LOGICAL LSAME @@ -299,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 8*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -313,7 +316,7 @@ INFO = -12 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -14 - ELSE IF( LWORK.LT.MAX( 1, 8*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF * @@ -321,13 +324,13 @@ * IF( INFO.EQ.0 ) THEN CALL DGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX(1, 8*N, 3*N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL DORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, -1, $ IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) IF( ILVL ) THEN CALL DORGQR( N, N, N, VL, LDVL, WORK, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) END IF IF( ILV ) THEN CALL DGGHD3( JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, VL, @@ -336,18 +339,21 @@ CALL DLAQZ0( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) ELSE CALL DGGHD3( 'N', 'N', N, 1, N, A, LDA, B, LDB, VL, LDVL, $ VR, LDVR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL DLAQZ0( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = LWKOPT END IF - - WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN @@ -367,7 +373,6 @@ EPS = DLAMCH( 'P' ) SMLNUM = DLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/dgghd3.f b/lapack-netlib/SRC/dgghd3.f index 43d7a77df..21a668573 100644 --- a/lapack-netlib/SRC/dgghd3.f +++ b/lapack-netlib/SRC/dgghd3.f @@ -179,14 +179,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> -*> \param[in] LWORK +*> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -211,7 +211,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup gghd3 * *> \par Further Details: * ===================== @@ -275,7 +275,12 @@ * INFO = 0 NB = ILAENV( 1, 'DGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF WORK( 1 ) = DBLE( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) @@ -325,7 +330,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = ONE RETURN @@ -885,6 +889,7 @@ IF ( JCOL.LT.IHI ) $ CALL DGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) +* WORK( 1 ) = DBLE( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/dggqrf.f b/lapack-netlib/SRC/dggqrf.f index 39d27a5c9..edac7f22f 100644 --- a/lapack-netlib/SRC/dggqrf.f +++ b/lapack-netlib/SRC/dggqrf.f @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup ggqrf * *> \par Further Details: * ===================== @@ -250,7 +250,7 @@ NB2 = ILAENV( 1, 'DGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'DORMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN @@ -287,6 +287,7 @@ * RQ factorization of N-by-P matrix B: B = T*Z. * CALL DGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO ) +* WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) * RETURN diff --git a/lapack-netlib/SRC/dggrqf.f b/lapack-netlib/SRC/dggrqf.f index ddf4104c5..3b1024c1c 100644 --- a/lapack-netlib/SRC/dggrqf.f +++ b/lapack-netlib/SRC/dggrqf.f @@ -172,7 +172,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup ggrqf * *> \par Further Details: * ===================== @@ -249,7 +249,7 @@ NB2 = ILAENV( 1, 'DGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'DORMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN diff --git a/lapack-netlib/SRC/dggsvd3.f b/lapack-netlib/SRC/dggsvd3.f index 503f0d8cc..ee4d11e86 100644 --- a/lapack-netlib/SRC/dggsvd3.f +++ b/lapack-netlib/SRC/dggsvd3.f @@ -278,7 +278,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -328,7 +328,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEsing +*> \ingroup ggsvd3 * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/dggsvp3.f b/lapack-netlib/SRC/dggsvp3.f index 4e1db3117..485d95b36 100644 --- a/lapack-netlib/SRC/dggsvp3.f +++ b/lapack-netlib/SRC/dggsvp3.f @@ -227,7 +227,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -250,7 +250,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup ggsvp3 * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/dgsvj0.f b/lapack-netlib/SRC/dgsvj0.f index 197a9a626..1008aa8c6 100644 --- a/lapack-netlib/SRC/dgsvj0.f +++ b/lapack-netlib/SRC/dgsvj0.f @@ -52,10 +52,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim @@ -117,7 +117,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -125,9 +125,9 @@ *> \param[in,out] V *> \verbatim *> V is DOUBLE PRECISION array, dimension (LDV,N) -*> If JOBV = 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/dgsvj1.f b/lapack-netlib/SRC/dgsvj1.f index 11d1dde2b..f1a099b2a 100644 --- a/lapack-netlib/SRC/dgsvj1.f +++ b/lapack-netlib/SRC/dgsvj1.f @@ -75,10 +75,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim @@ -147,7 +147,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -155,9 +155,9 @@ *> \param[in,out] V *> \verbatim *> V is DOUBLE PRECISION array, dimension (LDV,N) -*> If JOBV = 'V', then N rows of V are post-multipled by a +*> If JOBV = 'V', then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/dla_gbrfsx_extended.f b/lapack-netlib/SRC/dla_gbrfsx_extended.f index 5454b1bfc..b331178a9 100644 --- a/lapack-netlib/SRC/dla_gbrfsx_extended.f +++ b/lapack-netlib/SRC/dla_gbrfsx_extended.f @@ -645,7 +645,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL DAXPY( N, 1.0D+0, DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/dla_gerfsx_extended.f b/lapack-netlib/SRC/dla_gerfsx_extended.f index 92b0d76d4..bf8260d04 100644 --- a/lapack-netlib/SRC/dla_gerfsx_extended.f +++ b/lapack-netlib/SRC/dla_gerfsx_extended.f @@ -625,7 +625,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL DAXPY( N, 1.0D+0, DY, 1, Y( 1, J ), 1 ) diff --git a/lapack-netlib/SRC/dla_porfsx_extended.f b/lapack-netlib/SRC/dla_porfsx_extended.f index 5c8850fef..94d1087cd 100644 --- a/lapack-netlib/SRC/dla_porfsx_extended.f +++ b/lapack-netlib/SRC/dla_porfsx_extended.f @@ -617,7 +617,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL DAXPY( N, 1.0D+0, DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/dla_porpvgrw.f b/lapack-netlib/SRC/dla_porpvgrw.f index 93ad3eb6a..00fdd7ae1 100644 --- a/lapack-netlib/SRC/dla_porpvgrw.f +++ b/lapack-netlib/SRC/dla_porpvgrw.f @@ -134,9 +134,9 @@ * UPPER = LSAME( 'Upper', UPLO ) * -* DPOTRF will have factored only the NCOLSxNCOLS leading minor, so -* we restrict the growth search to that minor and use only the first -* 2*NCOLS workspace entries. +* DPOTRF will have factored only the NCOLSxNCOLS leading submatrix, +* so we restrict the growth search to that submatrix and use only +* the first 2*NCOLS workspace entries. * RPVGRW = 1.0D+0 DO I = 1, 2*NCOLS diff --git a/lapack-netlib/SRC/dla_syrfsx_extended.f b/lapack-netlib/SRC/dla_syrfsx_extended.f index e1cde6fc2..2e06a622b 100644 --- a/lapack-netlib/SRC/dla_syrfsx_extended.f +++ b/lapack-netlib/SRC/dla_syrfsx_extended.f @@ -647,7 +647,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL DAXPY( N, 1.0D+0, DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/dlaed2.f b/lapack-netlib/SRC/dlaed2.f index 9b1f1e093..1a53650e8 100644 --- a/lapack-netlib/SRC/dlaed2.f +++ b/lapack-netlib/SRC/dlaed2.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE DLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMDA, W, +* SUBROUTINE DLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMBDA, W, * Q2, INDX, INDXC, INDXP, COLTYP, INFO ) * * .. Scalar Arguments .. @@ -28,7 +28,7 @@ * .. Array Arguments .. * INTEGER COLTYP( * ), INDX( * ), INDXC( * ), INDXP( * ), * $ INDXQ( * ) -* DOUBLE PRECISION D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), +* DOUBLE PRECISION D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), * $ W( * ), Z( * ) * .. * @@ -123,9 +123,9 @@ *> process. *> \endverbatim *> -*> \param[out] DLAMDA +*> \param[out] DLAMBDA *> \verbatim -*> DLAMDA is DOUBLE PRECISION array, dimension (N) +*> DLAMBDA is DOUBLE PRECISION array, dimension (N) *> A copy of the first K eigenvalues which will be used by *> DLAED3 to form the secular equation. *> \endverbatim @@ -148,7 +148,7 @@ *> \param[out] INDX *> \verbatim *> INDX is INTEGER array, dimension (N) -*> The permutation used to sort the contents of DLAMDA into +*> The permutation used to sort the contents of DLAMBDA into *> ascending order. *> \endverbatim *> @@ -207,7 +207,7 @@ *> Modified by Francoise Tisseur, University of Tennessee *> * ===================================================================== - SUBROUTINE DLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMDA, W, + SUBROUTINE DLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMBDA, W, $ Q2, INDX, INDXC, INDXP, COLTYP, INFO ) * * -- LAPACK computational routine -- @@ -221,7 +221,7 @@ * .. Array Arguments .. INTEGER COLTYP( * ), INDX( * ), INDXC( * ), INDXP( * ), $ INDXQ( * ) - DOUBLE PRECISION D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), + DOUBLE PRECISION D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), $ W( * ), Z( * ) * .. * @@ -300,9 +300,9 @@ * re-integrate the deflated parts from the last pass * DO 20 I = 1, N - DLAMDA( I ) = D( INDXQ( I ) ) + DLAMBDA( I ) = D( INDXQ( I ) ) 20 CONTINUE - CALL DLAMRG( N1, N2, DLAMDA, 1, 1, INDXC ) + CALL DLAMRG( N1, N2, DLAMBDA, 1, 1, INDXC ) DO 30 I = 1, N INDX( I ) = INDXQ( INDXC( I ) ) 30 CONTINUE @@ -324,11 +324,11 @@ DO 40 J = 1, N I = INDX( J ) CALL DCOPY( N, Q( 1, I ), 1, Q2( IQ2 ), 1 ) - DLAMDA( J ) = D( I ) + DLAMBDA( J ) = D( I ) IQ2 = IQ2 + N 40 CONTINUE CALL DLACPY( 'A', N, N, Q2, N, Q, LDQ ) - CALL DCOPY( N, DLAMDA, 1, D, 1 ) + CALL DCOPY( N, DLAMBDA, 1, D, 1 ) GO TO 190 END IF * @@ -421,7 +421,7 @@ PJ = NJ ELSE K = K + 1 - DLAMDA( K ) = D( PJ ) + DLAMBDA( K ) = D( PJ ) W( K ) = Z( PJ ) INDXP( K ) = PJ PJ = NJ @@ -433,7 +433,7 @@ * Record the last eigenvalue. * K = K + 1 - DLAMDA( K ) = D( PJ ) + DLAMBDA( K ) = D( PJ ) W( K ) = Z( PJ ) INDXP( K ) = PJ * @@ -470,9 +470,9 @@ PSM( CT ) = PSM( CT ) + 1 130 CONTINUE * -* Sort the eigenvalues and corresponding eigenvectors into DLAMDA +* Sort the eigenvalues and corresponding eigenvectors into DLAMBDA * and Q2 respectively. The eigenvalues/vectors which were not -* deflated go into the first K slots of DLAMDA and Q2 respectively, +* deflated go into the first K slots of DLAMBDA and Q2 respectively, * while those which were deflated go into the last N - K slots. * I = 1 diff --git a/lapack-netlib/SRC/dlaed3.f b/lapack-netlib/SRC/dlaed3.f index c58944e60..f9982c89e 100644 --- a/lapack-netlib/SRC/dlaed3.f +++ b/lapack-netlib/SRC/dlaed3.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE DLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMDA, Q2, INDX, +* SUBROUTINE DLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMBDA, Q2, INDX, * CTOT, W, S, INFO ) * * .. Scalar Arguments .. @@ -27,7 +27,7 @@ * .. * .. Array Arguments .. * INTEGER CTOT( * ), INDX( * ) -* DOUBLE PRECISION D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), +* DOUBLE PRECISION D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), * $ S( * ), W( * ) * .. * @@ -44,12 +44,6 @@ *> being combined by the matrix of eigenvectors of the K-by-K system *> which is solved here. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -104,14 +98,12 @@ *> RHO >= 0 required. *> \endverbatim *> -*> \param[in,out] DLAMDA +*> \param[in] DLAMBDA *> \verbatim -*> DLAMDA is DOUBLE PRECISION array, dimension (K) +*> DLAMBDA is DOUBLE PRECISION array, dimension (K) *> The first K elements of this array contain the old roots *> of the deflated updating problem. These are the poles -*> of the secular equation. May be changed on output by -*> having lowest order bit set to zero on Cray X-MP, Cray Y-MP, -*> Cray-2, or Cray C-90, as described above. +*> of the secular equation. *> \endverbatim *> *> \param[in] Q2 @@ -180,7 +172,7 @@ *> Modified by Francoise Tisseur, University of Tennessee *> * ===================================================================== - SUBROUTINE DLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMDA, Q2, INDX, + SUBROUTINE DLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMBDA, Q2, INDX, $ CTOT, W, S, INFO ) * * -- LAPACK computational routine -- @@ -193,7 +185,7 @@ * .. * .. Array Arguments .. INTEGER CTOT( * ), INDX( * ) - DOUBLE PRECISION D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), + DOUBLE PRECISION D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), $ S( * ), W( * ) * .. * @@ -208,8 +200,8 @@ DOUBLE PRECISION TEMP * .. * .. External Functions .. - DOUBLE PRECISION DLAMC3, DNRM2 - EXTERNAL DLAMC3, DNRM2 + DOUBLE PRECISION DNRM2 + EXTERNAL DNRM2 * .. * .. External Subroutines .. EXTERNAL DCOPY, DGEMM, DLACPY, DLAED4, DLASET, XERBLA @@ -240,29 +232,9 @@ IF( K.EQ.0 ) $ RETURN * -* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), -* which on any of these machines zeros out the bottommost -* bit of DLAMDA(I) if it is 1; this makes the subsequent -* subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DLAMDA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DLAMDA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DLAMBDA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 10 I = 1, K - DLAMDA( I ) = DLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I ) - 10 CONTINUE * DO 20 J = 1, K - CALL DLAED4( K, J, DLAMDA, W, Q( 1, J ), RHO, D( J ), INFO ) + CALL DLAED4( K, J, DLAMBDA, W, Q( 1, J ), RHO, D( J ), INFO ) * * If the zero finder fails, the computation is terminated. * @@ -293,10 +265,10 @@ CALL DCOPY( K, Q, LDQ+1, W, 1 ) DO 60 J = 1, K DO 40 I = 1, J - 1 - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 40 CONTINUE DO 50 I = J + 1, K - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 50 CONTINUE 60 CONTINUE DO 70 I = 1, K diff --git a/lapack-netlib/SRC/dlaed8.f b/lapack-netlib/SRC/dlaed8.f index 3631fb456..5d1d9144d 100644 --- a/lapack-netlib/SRC/dlaed8.f +++ b/lapack-netlib/SRC/dlaed8.f @@ -19,7 +19,7 @@ * =========== * * SUBROUTINE DLAED8( ICOMPQ, K, N, QSIZ, D, Q, LDQ, INDXQ, RHO, -* CUTPNT, Z, DLAMDA, Q2, LDQ2, W, PERM, GIVPTR, +* CUTPNT, Z, DLAMBDA, Q2, LDQ2, W, PERM, GIVPTR, * GIVCOL, GIVNUM, INDXP, INDX, INFO ) * * .. Scalar Arguments .. @@ -30,7 +30,7 @@ * .. Array Arguments .. * INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), * $ INDXQ( * ), PERM( * ) -* DOUBLE PRECISION D( * ), DLAMDA( * ), GIVNUM( 2, * ), +* DOUBLE PRECISION D( * ), DLAMBDA( * ), GIVNUM( 2, * ), * $ Q( LDQ, * ), Q2( LDQ2, * ), W( * ), Z( * ) * .. * @@ -141,9 +141,9 @@ *> process. *> \endverbatim *> -*> \param[out] DLAMDA +*> \param[out] DLAMBDA *> \verbatim -*> DLAMDA is DOUBLE PRECISION array, dimension (N) +*> DLAMBDA is DOUBLE PRECISION array, dimension (N) *> A copy of the first K eigenvalues which will be used by *> DLAED3 to form the secular equation. *> \endverbatim @@ -238,7 +238,7 @@ * * ===================================================================== SUBROUTINE DLAED8( ICOMPQ, K, N, QSIZ, D, Q, LDQ, INDXQ, RHO, - $ CUTPNT, Z, DLAMDA, Q2, LDQ2, W, PERM, GIVPTR, + $ CUTPNT, Z, DLAMBDA, Q2, LDQ2, W, PERM, GIVPTR, $ GIVCOL, GIVNUM, INDXP, INDX, INFO ) * * -- LAPACK computational routine -- @@ -253,7 +253,7 @@ * .. Array Arguments .. INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), $ INDXQ( * ), PERM( * ) - DOUBLE PRECISION D( * ), DLAMDA( * ), GIVNUM( 2, * ), + DOUBLE PRECISION D( * ), DLAMBDA( * ), GIVNUM( 2, * ), $ Q( LDQ, * ), Q2( LDQ2, * ), W( * ), Z( * ) * .. * @@ -339,14 +339,14 @@ INDXQ( I ) = INDXQ( I ) + CUTPNT 20 CONTINUE DO 30 I = 1, N - DLAMDA( I ) = D( INDXQ( I ) ) + DLAMBDA( I ) = D( INDXQ( I ) ) W( I ) = Z( INDXQ( I ) ) 30 CONTINUE I = 1 J = CUTPNT + 1 - CALL DLAMRG( N1, N2, DLAMDA, 1, 1, INDX ) + CALL DLAMRG( N1, N2, DLAMBDA, 1, 1, INDX ) DO 40 I = 1, N - D( I ) = DLAMDA( INDX( I ) ) + D( I ) = DLAMBDA( INDX( I ) ) Z( I ) = W( INDX( I ) ) 40 CONTINUE * @@ -464,7 +464,7 @@ ELSE K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM JLAM = J END IF @@ -476,26 +476,26 @@ * K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM * 110 CONTINUE * -* Sort the eigenvalues and corresponding eigenvectors into DLAMDA +* Sort the eigenvalues and corresponding eigenvectors into DLAMBDA * and Q2 respectively. The eigenvalues/vectors which were not -* deflated go into the first K slots of DLAMDA and Q2 respectively, +* deflated go into the first K slots of DLAMBDA and Q2 respectively, * while those which were deflated go into the last N - K slots. * IF( ICOMPQ.EQ.0 ) THEN DO 120 J = 1, N JP = INDXP( J ) - DLAMDA( J ) = D( JP ) + DLAMBDA( J ) = D( JP ) PERM( J ) = INDXQ( INDX( JP ) ) 120 CONTINUE ELSE DO 130 J = 1, N JP = INDXP( J ) - DLAMDA( J ) = D( JP ) + DLAMBDA( J ) = D( JP ) PERM( J ) = INDXQ( INDX( JP ) ) CALL DCOPY( QSIZ, Q( 1, PERM( J ) ), 1, Q2( 1, J ), 1 ) 130 CONTINUE @@ -506,9 +506,9 @@ * IF( K.LT.N ) THEN IF( ICOMPQ.EQ.0 ) THEN - CALL DCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 ) + CALL DCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 ) ELSE - CALL DCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 ) + CALL DCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 ) CALL DLACPY( 'A', QSIZ, N-K, Q2( 1, K+1 ), LDQ2, $ Q( 1, K+1 ), LDQ ) END IF diff --git a/lapack-netlib/SRC/dlaed9.f b/lapack-netlib/SRC/dlaed9.f index b88cdd907..0d209c2c2 100644 --- a/lapack-netlib/SRC/dlaed9.f +++ b/lapack-netlib/SRC/dlaed9.f @@ -18,15 +18,15 @@ * Definition: * =========== * -* SUBROUTINE DLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMDA, W, -* S, LDS, INFO ) +* SUBROUTINE DLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMBDA, +* W, S, LDS, INFO ) * * .. Scalar Arguments .. * INTEGER INFO, K, KSTART, KSTOP, LDQ, LDS, N * DOUBLE PRECISION RHO * .. * .. Array Arguments .. -* DOUBLE PRECISION D( * ), DLAMDA( * ), Q( LDQ, * ), S( LDS, * ), +* DOUBLE PRECISION D( * ), DLAMBDA( * ), Q( LDQ, * ), S( LDS, * ), * $ W( * ) * .. * @@ -96,9 +96,9 @@ *> RHO >= 0 required. *> \endverbatim *> -*> \param[in] DLAMDA +*> \param[in] DLAMBDA *> \verbatim -*> DLAMDA is DOUBLE PRECISION array, dimension (K) +*> DLAMBDA is DOUBLE PRECISION array, dimension (K) *> The first K elements of this array contain the old roots *> of the deflated updating problem. These are the poles *> of the secular equation. @@ -151,8 +151,8 @@ *> at Berkeley, USA * * ===================================================================== - SUBROUTINE DLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMDA, W, - $ S, LDS, INFO ) + SUBROUTINE DLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMBDA, + $ W, S, LDS, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -163,7 +163,7 @@ DOUBLE PRECISION RHO * .. * .. Array Arguments .. - DOUBLE PRECISION D( * ), DLAMDA( * ), Q( LDQ, * ), S( LDS, * ), + DOUBLE PRECISION D( * ), DLAMBDA( * ), Q( LDQ, * ), S( LDS, * ), $ W( * ) * .. * @@ -174,8 +174,8 @@ DOUBLE PRECISION TEMP * .. * .. External Functions .. - DOUBLE PRECISION DLAMC3, DNRM2 - EXTERNAL DLAMC3, DNRM2 + DOUBLE PRECISION DNRM2 + EXTERNAL DNRM2 * .. * .. External Subroutines .. EXTERNAL DCOPY, DLAED4, XERBLA @@ -212,30 +212,9 @@ * IF( K.EQ.0 ) $ RETURN -* -* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), -* which on any of these machines zeros out the bottommost -* bit of DLAMDA(I) if it is 1; this makes the subsequent -* subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DLAMDA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DLAMDA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DLAMBDA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 10 I = 1, N - DLAMDA( I ) = DLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I ) - 10 CONTINUE * DO 20 J = KSTART, KSTOP - CALL DLAED4( K, J, DLAMDA, W, Q( 1, J ), RHO, D( J ), INFO ) + CALL DLAED4( K, J, DLAMBDA, W, Q( 1, J ), RHO, D( J ), INFO ) * * If the zero finder fails, the computation is terminated. * @@ -261,10 +240,10 @@ CALL DCOPY( K, Q, LDQ+1, W, 1 ) DO 70 J = 1, K DO 50 I = 1, J - 1 - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 50 CONTINUE DO 60 I = J + 1, K - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 60 CONTINUE 70 CONTINUE DO 80 I = 1, K diff --git a/lapack-netlib/SRC/dlals0.f b/lapack-netlib/SRC/dlals0.f index cfca22280..928405e22 100644 --- a/lapack-netlib/SRC/dlals0.f +++ b/lapack-netlib/SRC/dlals0.f @@ -389,6 +389,11 @@ $ ( POLES( I, 2 ).EQ.ZERO ) ) THEN WORK( I ) = ZERO ELSE +* +* Use calls to the subroutine DLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent +* optimizing compilers from doing x+(y+z). +* WORK( I ) = POLES( I, 2 )*Z( I ) / $ ( DLAMC3( POLES( I, 2 ), DSIGJ )- $ DIFLJ ) / ( POLES( I, 2 )+DJ ) @@ -440,6 +445,11 @@ IF( Z( J ).EQ.ZERO ) THEN WORK( I ) = ZERO ELSE +* +* Use calls to the subroutine DLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent +* optimizing compilers from doing x+(y+z). +* WORK( I ) = Z( J ) / ( DLAMC3( DSIGJ, -POLES( I+1, $ 2 ) )-DIFR( I, 1 ) ) / $ ( DSIGJ+POLES( I, 1 ) ) / DIFR( I, 2 ) diff --git a/lapack-netlib/SRC/dlalsa.f b/lapack-netlib/SRC/dlalsa.f index da8e0fa17..d89f5d2f9 100644 --- a/lapack-netlib/SRC/dlalsa.f +++ b/lapack-netlib/SRC/dlalsa.f @@ -43,9 +43,9 @@ *> *> \verbatim *> -*> DLALSA is an itermediate step in solving the least squares problem +*> DLALSA is an intermediate step in solving the least squares problem *> by computing the SVD of the coefficient matrix in compact form (The -*> singular vectors are computed as products of simple orthorgonal +*> singular vectors are computed as products of simple orthogonal *> matrices.). *> *> If ICOMPQ = 0, DLALSA applies the inverse of the left singular vector diff --git a/lapack-netlib/SRC/dlalsd.f b/lapack-netlib/SRC/dlalsd.f index d22c45dc6..706ac4c90 100644 --- a/lapack-netlib/SRC/dlalsd.f +++ b/lapack-netlib/SRC/dlalsd.f @@ -47,12 +47,6 @@ *> problem; in this case a minimum norm solution is returned. *> The actual singular values are returned in D in ascending order. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dlamswlq.f b/lapack-netlib/SRC/dlamswlq.f index 70e78f4b1..07ef1bd57 100644 --- a/lapack-netlib/SRC/dlamswlq.f +++ b/lapack-netlib/SRC/dlamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -189,29 +192,31 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamswlq +*> * ===================================================================== SUBROUTINE DLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + DOUBLE PRECISION A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, CTR, LW + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, CTR, LW, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -223,52 +228,60 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * INFO = 0 IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLAMSWLQ', -INFO ) - WORK(1) = LW RETURN - ELSE IF (LQUERY) THEN - WORK(1) = LW + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -402,7 +415,8 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN +* RETURN * * End of DLAMSWLQ diff --git a/lapack-netlib/SRC/dlamtsqr.f b/lapack-netlib/SRC/dlamtsqr.f index 962a31476..023db5ac9 100644 --- a/lapack-netlib/SRC/dlamtsqr.f +++ b/lapack-netlib/SRC/dlamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) -*> +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). *> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -191,29 +193,31 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamtsqr +*> * ===================================================================== SUBROUTINE DLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + DOUBLE PRECISION A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -225,12 +229,13 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -238,11 +243,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -253,38 +264,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = LW + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL DGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -410,7 +421,8 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN +* RETURN * * End of DLAMTSQR diff --git a/lapack-netlib/SRC/dlaqp2rk.c b/lapack-netlib/SRC/dlaqp2rk.c new file mode 100644 index 000000000..de216ad97 --- /dev/null +++ b/lapack-netlib/SRC/dlaqp2rk.c @@ -0,0 +1,923 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + *info = *n + kk - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL >= ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL >= ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + +/* Set K, the number of factorized columns. */ + + *k = kk - 1; + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. */ + + i__2 = minmnfact; + for (j = kk; j <= i__2; ++j) { + tau[j] = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,KK:N): */ +/* 1) swap the KK-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) copy the KK-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than KK in the next loop step.) */ +/* 3) Save the pivot interchange with the indices relative to the */ +/* the original matrix A, not the block A(1:M,1:N). */ + + if (kp != kk) { + dswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1); + vn1[kp] = vn1[kk]; + vn2[kp] = vn2[kk]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[kk]; + jpiv[kk] = itemp; + } + +/* Generate elementary reflector H(KK) using the column A(I:M,KK), */ +/* if the column has more than one element, otherwise */ +/* the elementary reflector would be an identity matrix, */ +/* and TAU(KK) = ZERO. */ + + if (i__ < *m) { + i__2 = *m - i__ + 1; + dlarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], & + c__1, &tau[kk]); + } else { + tau[kk] = 0.; + } + +/* Check if TAU(KK) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(KK) for Inf, */ +/* since DLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by DLARFG can contain Inf, which requires */ +/* TAU(KK) to contain NaN. Therefore, this case of generating Inf */ +/* by DLARFG is covered by checking TAU(KK) for NaN. */ + + if (disnan_(&tau[kk])) { + *k = kk - 1; + *info = kk; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = tau[kk]; + *relmaxc2nrmk = tau[kk]; + +/* Array TAU(KK:MINMNFACT) is not set and contains */ +/* undefined elements, except the first element TAU(KK) = NaN. */ + + return 0; + } + +/* Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left. */ +/* ( If M >= N, then at KK = N there is no residual matrix, */ +/* i.e. no columns of A to update, only columns of B. */ +/* If M < N, then at KK = M-IOFFSET, I = M and we have a */ +/* one-row residual matrix in A and the elementary */ +/* reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update */ +/* is needed for the residual matrix in A and the */ +/* right-hand-side-matrix in B. */ +/* Therefore, we update only if */ +/* KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */ +/* condition is satisfied, not only KK < N+NRHS ) */ + + if (kk < minmnupdt) { + aikk = a[i__ + kk * a_dim1]; + a[i__ + kk * a_dim1] = 1.; + i__2 = *m - i__ + 1; + i__3 = *n + *nrhs - kk; + dlarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &tau[ + kk], &a[i__ + (kk + 1) * a_dim1], lda, &work[1]); + a[i__ + kk * a_dim1] = aikk; + } + + if (kk < minmnfact) { + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */ +/* when KK < f2cmin(M-IOFFSET, N). */ + + i__2 = *n; + for (j = kk + 1; j <= i__2; ++j) { + if (vn1[j] != 0.) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + +/* Computing 2nd power */ + d__2 = (d__1 = a[i__ + j * a_dim1], abs(d__1)) / vn1[j]; + temp = 1. - d__2 * d__2; + temp = f2cmax(temp,0.); +/* Computing 2nd power */ + d__1 = vn1[j] / vn2[j]; + temp2 = temp * (d__1 * d__1); + if (temp2 <= tol3z) { + +/* Compute the column 2-norm for the partial */ +/* column A(I+1:M,J) by explicitly computing it, */ +/* and store it in both partial 2-norm vector VN1 */ +/* and exact column 2-norm vector VN2. */ + + i__3 = *m - i__; + vn1[j] = dnrm2_(&i__3, &a[i__ + 1 + j * a_dim1], & + c__1); + vn2[j] = vn1[j]; + + } else { + +/* Update the column 2-norm for the partial */ +/* column A(I+1:M,J) by removing one */ +/* element A(I,J) and store it in partial */ +/* 2-norm vector VN1. */ + + vn1[j] *= sqrt(temp); + + } + } + } + + } + +/* End factorization loop */ + + } + +/* If we reached this point, all colunms have been factorized, */ +/* i.e. no condition was triggered to exit the routine. */ +/* Set the number of factorized columns. */ + + *k = *kmax; + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */ +/* we return. */ + + if (*k < minmnfact) { + + i__1 = *n - *k; + jmaxc2nrm = *k + idamax_(&i__1, &vn1[*k + 1], &c__1); + *maxc2nrmk = vn1[jmaxc2nrm]; + + if (*k == 0) { + *relmaxc2nrmk = 1.; + } else { + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + } + + } else { + *maxc2nrmk = 0.; + *relmaxc2nrmk = 0.; + } + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, set TAUs corresponding to the columns that were */ +/* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO. */ + + i__1 = minmnfact; + for (j = *k + 1; j <= i__1; ++j) { + tau[j] = 0.; + } + + return 0; + +/* End of DLAQP2RK */ + +} /* dlaqp2rk_ */ + diff --git a/lapack-netlib/SRC/dlaqp2rk.f b/lapack-netlib/SRC/dlaqp2rk.f new file mode 100644 index 000000000..aecd6bb69 --- /dev/null +++ b/lapack-netlib/SRC/dlaqp2rk.f @@ -0,0 +1,713 @@ +*> \brief \b DLAQP2RK computes truncated QR factorization with column pivoting of a real matrix block using Level 2 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DLAQP2RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, +* $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, +* $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, +* $ INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS +* DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER JPIV( * ) +* DOUBLE PRECISION A( LDA, * ), TAU( * ), VN1( * ), VN2( * ), +* $ WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLAQP2RK computes a truncated (rank K) or full rank Householder QR +*> factorization with column pivoting of a real matrix +*> block A(IOFFSET+1:M,1:N) as +*> +*> A * P(K) = Q(K) * R(K). +*> +*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N) +*> is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides matrix block B +*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**T * B. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M-IOFFSET,N), then this stopping +*> criterion is not used, factorize columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The second factorization stopping criterion. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The third factorization stopping criterion. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine DGEQP3RK. 1 <= KP1 <= N_orig_mat. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is DOUBLE PRECISION +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine DGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:K) below +*> the diagonal together with the array TAU represent +*> the orthogonal matrix Q(K) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:K) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS). +*> The left part A(IOFFSET+1:M,K+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(K)**T. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is DOUBLE PRECISION array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is DOUBLE PRECISION array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is DOUBLE PRECISION array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (N-1) +*> Used in DLARF subroutine to apply an elementary +*> reflector from the left. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp2rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, + $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, + $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, + $ INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS + DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. + INTEGER JPIV( * ) + DOUBLE PRECISION A( LDA, * ), TAU( * ), VN1( * ), VN2( * ), + $ WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + INTEGER I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT, + $ MINMNUPDT + DOUBLE PRECISION AIKK, HUGEVAL, TEMP, TEMP2, TOL3Z +* .. +* .. External Subroutines .. + EXTERNAL DLARF, DLARFG, DSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL DISNAN + INTEGER IDAMAX + DOUBLE PRECISION DLAMCH, DNRM2 + EXTERNAL DISNAN, DLAMCH, IDAMAX, DNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* +* MINMNUPDT is the smallest dimension +* of the subarray A(IOFFSET+1:M,1:N+NRHS) to be udated, which +* contains the submatrices A(IOFFSET+1:M,1:N) and +* B(IOFFSET+1:M,1:NRHS) as column blocks. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + KMAX = MIN( KMAX, MINMNFACT ) + TOL3Z = SQRT( DLAMCH( 'Epsilon' ) ) + HUGEVAL = DLAMCH( 'Overflow' ) +* +* Compute the factorization, KK is the lomn loop index. +* + DO KK = 1, KMAX +* + I = IOFFSET + KK +* + IF( I.EQ.1 ) THEN +* +* ============================================================ +* +* We are at the first column of the original whole matrix A, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + + KP = KP1 +* +* ============================================================ +* + ELSE +* +* ============================================================ +* +* Determine the pivot column in KK-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). +* + KP = ( KK-1 ) + IDAMAX( N-KK+1, VN1( KK ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,KK:N) in step KK. +* RELMAXC2NRMK will be computed later, after somecondition +* checks on MAXC2NRMK. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains NaN, and set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( DISNAN( MAXC2NRMK ) ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + INFO = K + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* Array TAU(K+1:MINMNFACT) is not set and contains +* undefined elements. +* + RETURN + END IF +* +* ============================================================ +* +* Quick return, if the submatrix A(I:M,KK:N) is +* a zero matrix. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + RELMAXC2NRMK = ZERO +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + KK - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third stopping criteria. +* NOTE: There is no need to test for ABSTOL >= ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL >= ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. + + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* +* Set K, the number of factorized columns. +* + K = KK - 1 +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,KK:N): +* 1) swap the KK-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) copy the KK-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than KK in the next loop step.) +* 3) Save the pivot interchange with the indices relative to the +* the original matrix A, not the block A(1:M,1:N). +* + IF( KP.NE.KK ) THEN + CALL DSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 ) + VN1( KP ) = VN1( KK ) + VN2( KP ) = VN2( KK ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( KK ) + JPIV( KK ) = ITEMP + END IF +* +* Generate elementary reflector H(KK) using the column A(I:M,KK), +* if the column has more than one element, otherwise +* the elementary reflector would be an identity matrix, +* and TAU(KK) = ZERO. +* + IF( I.LT.M ) THEN + CALL DLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1, + $ TAU( KK ) ) + ELSE + TAU( KK ) = ZERO + END IF +* +* Check if TAU(KK) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(KK) for Inf, +* since DLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by DLARFG can contain Inf, which requires +* TAU(KK) to contain NaN. Therefore, this case of generating Inf +* by DLARFG is covered by checking TAU(KK) for NaN. +* + IF( DISNAN( TAU(KK) ) ) THEN + K = KK - 1 + INFO = KK +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAU( KK ) + RELMAXC2NRMK = TAU( KK ) +* +* Array TAU(KK:MINMNFACT) is not set and contains +* undefined elements, except the first element TAU(KK) = NaN. +* + RETURN + END IF +* +* Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left. +* ( If M >= N, then at KK = N there is no residual matrix, +* i.e. no columns of A to update, only columns of B. +* If M < N, then at KK = M-IOFFSET, I = M and we have a +* one-row residual matrix in A and the elementary +* reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update +* is needed for the residual matrix in A and the +* right-hand-side-matrix in B. +* Therefore, we update only if +* KK < MINMNUPDT = min(M-IOFFSET, N+NRHS) +* condition is satisfied, not only KK < N+NRHS ) +* + IF( KK.LT.MINMNUPDT ) THEN + AIKK = A( I, KK ) + A( I, KK ) = ONE + CALL DLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1, + $ TAU( KK ), A( I, KK+1 ), LDA, WORK( 1 ) ) + A( I, KK ) = AIKK + END IF +* + IF( KK.LT.MINMNFACT ) THEN +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. +* when KK < min(M-IOFFSET, N). +* + DO J = KK + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2 + TEMP = MAX( TEMP, ZERO ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2 .LE. TOL3Z ) THEN +* +* Compute the column 2-norm for the partial +* column A(I+1:M,J) by explicitly computing it, +* and store it in both partial 2-norm vector VN1 +* and exact column 2-norm vector VN2. +* + VN1( J ) = DNRM2( M-I, A( I+1, J ), 1 ) + VN2( J ) = VN1( J ) +* + ELSE +* +* Update the column 2-norm for the partial +* column A(I+1:M,J) by removing one +* element A(I,J) and store it in partial +* 2-norm vector VN1. +* + VN1( J ) = VN1( J )*SQRT( TEMP ) +* + END IF + END IF + END DO +* + END IF +* +* End factorization loop +* + END DO +* +* If we reached this point, all colunms have been factorized, +* i.e. no condition was triggered to exit the routine. +* Set the number of factorized columns. +* + K = KMAX +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before +* we return. +* + IF( K.LT.MINMNFACT ) THEN +* + JMAXC2NRM = K + IDAMAX( N-K, VN1( K+1 ), 1 ) + MAXC2NRMK = VN1( JMAXC2NRM ) +* + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + END IF +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, set TAUs corresponding to the columns that were +* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO. +* + DO J = K + 1, MINMNFACT + TAU( J ) = ZERO + END DO +* + RETURN +* +* End of DLAQP2RK +* + END diff --git a/lapack-netlib/SRC/dlaqp3rk.c b/lapack-netlib/SRC/dlaqp3rk.c new file mode 100644 index 000000000..e8c61c257 --- /dev/null +++ b/lapack-netlib/SRC/dlaqp3rk.c @@ -0,0 +1,1113 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + dgemm_("No transpose", "Transpose", &i__1, nrhs, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* Quick return, if the submatrix A(I:M,K:N) is */ +/* a zero matrix. We need to check it only if the column index */ +/* (same as row index) is larger than 1, since the condition */ +/* for the whole original matrix A_orig is checked in the main */ +/* routine. */ + + if (*maxc2nrmk == 0.) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *relmaxc2nrmk = 0.; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix is zero and we stop the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + dgemm_("No transpose", "Transpose", &i__1, nrhs, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + tau[j] = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* Check if the submatrix A(I:M,K:N) contains Inf, */ +/* set INFO parameter to the column number, where */ +/* the first Inf is found plus N, and continue */ +/* the computation. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + if (*info == 0 && *maxc2nrmk > myhugeval) { + *info = *n + k - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third tolerance stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL.GE.ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig; */ + + *kb = k - 1; + if__ = i__ - 1; + +/* Apply the block reflector to the residual of the */ +/* matrix A and the residual of the right hand sides B, if */ +/* the residual matrix and and/or the residual of the right */ +/* hand sides exist, i.e. if the submatrix */ +/* A(I+1:M,KB+1:N+NRHS) exists. This occurs when */ +/* KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + dgemm_("No transpose", "Transpose", &i__1, &i__2, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*kb + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + tau[j] = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,K:N): */ +/* 1) swap the K-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */ +/* 3) copy the K-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. (Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than K in the next loop step.) */ +/* 4) Save the pivot interchange with the indices relative to the */ +/* the original matrix A_orig, not the block A(1:M,1:N). */ + + if (kp != k) { + dswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + dswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf); + vn1[kp] = vn1[k]; + vn2[kp] = vn2[k]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[k]; + jpiv[k] = itemp; + } + +/* Apply previous Householder reflectors to column K: */ +/* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T. */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + dgemv_("No transpose", &i__1, &i__2, &c_b7, &a[i__ + a_dim1], lda, + &f[k + f_dim1], ldf, &c_b8, &a[i__ + k * a_dim1], &c__1); + } + +/* Generate elementary reflector H(k) using the column A(I:M,K). */ + + if (i__ < *m) { + i__1 = *m - i__ + 1; + dlarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], & + c__1, &tau[k]); + } else { + tau[k] = 0.; + } + +/* Check if TAU(K) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(K) for Inf, */ +/* since DLARFG cannot produce TAU(K) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by DLARFG can contain Inf, which requires */ +/* TAU(K) to contain NaN. Therefore, this case of generating Inf */ +/* by DLARFG is covered by checking TAU(K) for NaN. */ + + if (disnan_(&tau[k])) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *info = k; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = tau[k]; + *relmaxc2nrmk = tau[k]; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix contains NaN and we stop */ +/* the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + dgemm_("No transpose", "Transpose", &i__1, nrhs, kb, &c_b7, & + a[if__ + 1 + a_dim1], lda, &f[*n + 1 + f_dim1], ldf, & + c_b8, &a[if__ + 1 + (*n + 1) * a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* =============================================================== */ + + aik = a[i__ + k * a_dim1]; + a[i__ + k * a_dim1] = 1.; + +/* =============================================================== */ + +/* Compute the current K-th column of F: */ +/* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K). */ + + if (k < *n + *nrhs) { + i__1 = *m - i__ + 1; + i__2 = *n + *nrhs - k; + dgemv_("Transpose", &i__1, &i__2, &tau[k], &a[i__ + (k + 1) * + a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b30, &f[k + + 1 + k * f_dim1], &c__1); + } + +/* 2) Zero out elements above and on the diagonal of the */ +/* column K in matrix F, i.e elements F(1:K,K). */ + + i__1 = k; + for (j = 1; j <= i__1; ++j) { + f[j + k * f_dim1] = 0.; + } + +/* 3) Incremental updating of the K-th column of F: */ +/* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T */ +/* * A(I:M,K). */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + d__1 = -tau[k]; + dgemv_("Transpose", &i__1, &i__2, &d__1, &a[i__ + a_dim1], lda, & + a[i__ + k * a_dim1], &c__1, &c_b30, &auxv[1], &c__1); + + i__1 = *n + *nrhs; + i__2 = k - 1; + dgemv_("No transpose", &i__1, &i__2, &c_b8, &f[f_dim1 + 1], ldf, & + auxv[1], &c__1, &c_b8, &f[k * f_dim1 + 1], &c__1); + } + +/* =============================================================== */ + +/* Update the current I-th row of A: */ +/* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */ +/* - A(I,1:K)*F(K+1:N+NRHS,1:K)**T. */ + + if (k < *n + *nrhs) { + i__1 = *n + *nrhs - k; + dgemv_("No transpose", &i__1, &k, &c_b7, &f[k + 1 + f_dim1], ldf, + &a[i__ + a_dim1], lda, &c_b8, &a[i__ + (k + 1) * a_dim1], + lda); + } + + a[i__ + k * a_dim1] = aik; + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */ +/* when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */ + + if (k < minmnfact) { + + i__1 = *n; + for (j = k + 1; j <= i__1; ++j) { + if (vn1[j] != 0.) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + + temp = (d__1 = a[i__ + j * a_dim1], abs(d__1)) / vn1[j]; +/* Computing MAX */ + d__1 = 0., d__2 = (temp + 1.) * (1. - temp); + temp = f2cmax(d__1,d__2); +/* Computing 2nd power */ + d__1 = vn1[j] / vn2[j]; + temp2 = temp * (d__1 * d__1); + if (temp2 <= tol3z) { + +/* At J-index, we have a difficult column for the */ +/* update of the 2-norm. Save the index of the previous */ +/* difficult column in IWORK(J-1). */ +/* NOTE: ILSTCC > 1, threfore we can use IWORK only */ +/* with N-1 elements, where the elements are */ +/* shifted by 1 to the left. */ + + iwork[j - 1] = lsticc; + +/* Set the index of the last difficult column LSTICC. */ + + lsticc = j; + + } else { + vn1[j] *= sqrt(temp); + } + } + } + + } + +/* End of while loop. */ + + } + +/* Now, afler the loop: */ +/* Set KB, the number of factorized columns in the block; */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig, IF = IOFFSET + KB. */ + + *kb = k; + if__ = i__; + +/* Apply the block reflector to the residual of the matrix A */ +/* and the residual of the right hand sides B, if the residual */ +/* matrix and and/or the residual of the right hand sides */ +/* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */ +/* This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + dgemm_("No transpose", "Transpose", &i__1, &i__2, kb, &c_b7, &a[if__ + + 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b8, &a[if__ + + 1 + (*kb + 1) * a_dim1], lda); + } + +/* Recompute the 2-norm of the difficult columns. */ +/* Loop over the index of the difficult columns from the largest */ +/* to the smallest index. */ + + while(lsticc > 0) { + +/* LSTICC is the index of the last difficult column is greater */ +/* than 1. */ +/* ITEMP is the index of the previous difficult column. */ + + itemp = iwork[lsticc - 1]; + +/* Compute the 2-norm explicilty for the last difficult column and */ +/* save it in the partial and exact 2-norm vectors VN1 and VN2. */ + +/* NOTE: The computation of VN1( LSTICC ) relies on the fact that */ +/* DNRM2 does not fail on vectors with norm below the value of */ +/* SQRT(DLAMCH('S')) */ + + i__1 = *m - if__; + vn1[lsticc] = dnrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1); + vn2[lsticc] = vn1[lsticc]; + +/* Downdate the index of the last difficult column to */ +/* the index of the previous difficult column. */ + + lsticc = itemp; + + } + + return 0; + +/* End of DLAQP3RK */ + +} /* dlaqp3rk_ */ + diff --git a/lapack-netlib/SRC/dlaqp3rk.f b/lapack-netlib/SRC/dlaqp3rk.f new file mode 100644 index 000000000..8139345ed --- /dev/null +++ b/lapack-netlib/SRC/dlaqp3rk.f @@ -0,0 +1,935 @@ +*> \brief \b DLAQP3RK computes a step of truncated QR factorization with column pivoting of a real m-by-n matrix A using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DLAQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, +* $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, +* $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) +* IMPLICIT NONE +* LOGICAL DONE +* INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, +* $ NB, NRHS +* DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* +* .. Scalar Arguments .. +* LOGICAL DONE +* INTEGER KB, LDA, LDF, M, N, NB, NRHS, IOFFSET +* DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* DOUBLE PRECISION A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ), +* $ VN1( * ), VN2( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLAQP3RK computes a step of truncated QR factorization with column +*> pivoting of a real M-by-N matrix A block A(IOFFSET+1:M,1:N) +*> by using Level 3 BLAS as +*> +*> A * P(KB) = Q(KB) * R(KB). +*> +*> The routine tries to factorize NB columns from A starting from +*> the row IOFFSET+1 and updates the residual matrix with BLAS 3 +*> xGEMM. The number of actually factorized columns is returned +*> is smaller than NB. +*> +*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides B matrix stored +*> in A(IOFFSET+1:M,1:N+1:N+NRHS) with Q(KB)**T * B. +*> +*> Cases when the number of factorized columns KB < NB: +*> +*> (1) In some cases, due to catastrophic cancellations, it cannot +*> factorize all NB columns and need to update the residual matrix. +*> Hence, the actual number of factorized columns in the block returned +*> in KB is smaller than NB. The logical DONE is returned as FALSE. +*> The factorization of the whole original matrix A_orig must proceed +*> with the next block. +*> +*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied, +*> the factorization of the whole original matrix A_orig is stopped, +*> the logical DONE is returned as TRUE. The number of factorized +*> columns which is smaller than NB is returned in KB. +*> +*> (3) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix is a zero matrix in some factorization +*> step KB, the factorization of the whole original matrix A_orig is +*> stopped, the logical DONE is returned as TRUE. The number of +*> factorized columns which is smaller than NB is returned in KB. +*> +*> (4) Whenever NaN is detected in the matrix A or in the array TAU, +*> the factorization of the whole original matrix A_orig is stopped, +*> the logical DONE is returned as TRUE. The number of factorized +*> columns which is smaller than NB is returned in KB. The INFO +*> parameter is set to the column index of the first NaN occurrence. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0 +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> Factorization block size, i.e the number of columns +*> to factorize in the matrix A. 0 <= NB +*> +*> If NB = 0, then the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on NB and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on NB and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine DGEQP3RK. 1 <= KP1 <= N_orig. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is DOUBLE PRECISION +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine DGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:KB) below +*> the diagonal together with the array TAU represent +*> the orthogonal matrix Q(KB) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:KB) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS). +*> The left part A(IOFFSET+1:M,KB+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(KB)**T. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] +*> \verbatim +*> DONE is LOGICAL +*> TRUE: a) if the factorization completed before processing +*> all min(M-IOFFSET,NB,N) columns due to ABSTOL +*> or RELTOL criterion, +*> b) if the factorization completed before processing +*> all min(M-IOFFSET,NB,N) columns due to the +*> residual matrix being a ZERO matrix. +*> c) when NaN was detected in the matrix A +*> or in the array TAU. +*> FALSE: otherwise. +*> \endverbatim +*> +*> \param[out] KB +*> \verbatim +*> KB is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= KB <= min(M-IOFFSET,NB,N). +*> +*> KB also represents the number of non-zero Householder +*> vectors. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank KB. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank KB) to the maximum column 2-norm of the +*> original matrix A_orig. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is DOUBLE PRECISION array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is DOUBLE PRECISION array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is DOUBLE PRECISION array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] AUXV +*> \verbatim +*> AUXV is DOUBLE PRECISION array, dimension (NB) +*> Auxiliary vector. +*> \endverbatim +*> +*> \param[out] F +*> \verbatim +*> F is DOUBLE PRECISION array, dimension (LDF,NB) +*> Matrix F**T = L*(Y**T)*A. +*> \endverbatim +*> +*> \param[in] LDF +*> \verbatim +*> LDF is INTEGER +*> The leading dimension of the array F. LDF >= max(1,N+NRHS). +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step KB+1 ( when KB columns +*> have been factorized ). +*> +*> On exit: +*> KB is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(KB+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=KB+1, TAU(KB+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the actorization +*> step KB+1 ( when KB columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp3rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, + $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + LOGICAL DONE + INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, + $ NB, NRHS + DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + DOUBLE PRECISION A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ), + $ VN1( * ), VN2( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + INTEGER ITEMP, J, K, MINMNFACT, MINMNUPDT, + $ LSTICC, KP, I, IF + DOUBLE PRECISION AIK, HUGEVAL, TEMP, TEMP2, TOL3Z +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DGEMV, DLARFG, DSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL DISNAN + INTEGER IDAMAX + DOUBLE PRECISION DLAMCH, DNRM2 + EXTERNAL DISNAN, DLAMCH, IDAMAX, DNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + NB = MIN( NB, MINMNFACT ) + TOL3Z = SQRT( DLAMCH( 'Epsilon' ) ) + HUGEVAL = DLAMCH( 'Overflow' ) +* +* Compute factorization in a while loop over NB columns, +* K is the column index in the block A(1:M,1:N). +* + K = 0 + LSTICC = 0 + DONE = .FALSE. +* + DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 ) + K = K + 1 + I = IOFFSET + K +* + IF( I.EQ.1 ) THEN +* +* We are at the first column of the original whole matrix A_orig, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + KP = KP1 +* + ELSE +* +* Determine the pivot column in K-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). +* + KP = ( K-1 ) + IDAMAX( N-K+1, VN1( K ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,K:N) in step K. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains NaN, set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( DISNAN( MAXC2NRMK ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = KB + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. + + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL DGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* Quick return, if the submatrix A(I:M,K:N) is +* a zero matrix. We need to check it only if the column index +* (same as row index) is larger than 1, since the condition +* for the whole original matrix A_orig is checked in the main +* routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + RELMAXC2NRMK = ZERO +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix is zero and we stop the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL DGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. +* + DO J = K, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + K - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third tolerance stopping criteria. +* NOTE: There is no need to test for ABSTOL.GE.ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig; +* + KB = K - 1 + IF = I - 1 +* +* Apply the block reflector to the residual of the +* matrix A and the residual of the right hand sides B, if +* the residual matrix and and/or the residual of the right +* hand sides exist, i.e. if the submatrix +* A(I+1:M,KB+1:N+NRHS) exists. This occurs when +* KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL DGEMM( 'No transpose', 'Transpose', + $ M-IF, N+NRHS-KB, KB,-ONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. +* + DO J = K, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,K:N): +* 1) swap the K-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) +* 3) copy the K-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. (Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than K in the next loop step.) +* 4) Save the pivot interchange with the indices relative to the +* the original matrix A_orig, not the block A(1:M,1:N). +* + IF( KP.NE.K ) THEN + CALL DSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 ) + CALL DSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF ) + VN1( KP ) = VN1( K ) + VN2( KP ) = VN2( K ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( K ) + JPIV( K ) = ITEMP + END IF +* +* Apply previous Householder reflectors to column K: +* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T. +* + IF( K.GT.1 ) THEN + CALL DGEMV( 'No transpose', M-I+1, K-1, -ONE, A( I, 1 ), + $ LDA, F( K, 1 ), LDF, ONE, A( I, K ), 1 ) + END IF +* +* Generate elementary reflector H(k) using the column A(I:M,K). +* + IF( I.LT.M ) THEN + CALL DLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) ) + ELSE + TAU( K ) = ZERO + END IF +* +* Check if TAU(K) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(K) for Inf, +* since DLARFG cannot produce TAU(K) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by DLARFG can contain Inf, which requires +* TAU(K) to contain NaN. Therefore, this case of generating Inf +* by DLARFG is covered by checking TAU(K) for NaN. +* + IF( DISNAN( TAU(K) ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = K +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAU( K ) + RELMAXC2NRMK = TAU( K ) +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL DGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* =============================================================== +* + AIK = A( I, K ) + A( I, K ) = ONE +* +* =============================================================== +* +* Compute the current K-th column of F: +* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K). +* + IF( K.LT.N+NRHS ) THEN + CALL DGEMV( 'Transpose', M-I+1, N+NRHS-K, + $ TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1, + $ ZERO, F( K+1, K ), 1 ) + END IF +* +* 2) Zero out elements above and on the diagonal of the +* column K in matrix F, i.e elements F(1:K,K). +* + DO J = 1, K + F( J, K ) = ZERO + END DO +* +* 3) Incremental updating of the K-th column of F: +* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T +* * A(I:M,K). +* + IF( K.GT.1 ) THEN + CALL DGEMV( 'Transpose', M-I+1, K-1, -TAU( K ), + $ A( I, 1 ), LDA, A( I, K ), 1, ZERO, + $ AUXV( 1 ), 1 ) +* + CALL DGEMV( 'No transpose', N+NRHS, K-1, ONE, + $ F( 1, 1 ), LDF, AUXV( 1 ), 1, ONE, + $ F( 1, K ), 1 ) + END IF +* +* =============================================================== +* +* Update the current I-th row of A: +* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) +* - A(I,1:K)*F(K+1:N+NRHS,1:K)**T. +* + IF( K.LT.N+NRHS ) THEN + CALL DGEMV( 'No transpose', N+NRHS-K, K, -ONE, + $ F( K+1, 1 ), LDF, A( I, 1 ), LDA, ONE, + $ A( I, K+1 ), LDA ) + END IF +* + A( I, K ) = AIK +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. +* when K < MINMNFACT = min( M-IOFFSET, N ). +* + IF( K.LT.MINMNFACT ) THEN +* + DO J = K + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ABS( A( I, J ) ) / VN1( J ) + TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2.LE.TOL3Z ) THEN +* +* At J-index, we have a difficult column for the +* update of the 2-norm. Save the index of the previous +* difficult column in IWORK(J-1). +* NOTE: ILSTCC > 1, threfore we can use IWORK only +* with N-1 elements, where the elements are +* shifted by 1 to the left. +* + IWORK( J-1 ) = LSTICC +* +* Set the index of the last difficult column LSTICC. +* + LSTICC = J +* + ELSE + VN1( J ) = VN1( J )*SQRT( TEMP ) + END IF + END IF + END DO +* + END IF +* +* End of while loop. +* + END DO +* +* Now, afler the loop: +* Set KB, the number of factorized columns in the block; +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig, IF = IOFFSET + KB. +* + KB = K + IF = I +* +* Apply the block reflector to the residual of the matrix A +* and the residual of the right hand sides B, if the residual +* matrix and and/or the residual of the right hand sides +* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. +* This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL DGEMM( 'No transpose', 'Transpose', + $ M-IF, N+NRHS-KB, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* Recompute the 2-norm of the difficult columns. +* Loop over the index of the difficult columns from the largest +* to the smallest index. +* + DO WHILE( LSTICC.GT.0 ) +* +* LSTICC is the index of the last difficult column is greater +* than 1. +* ITEMP is the index of the previous difficult column. +* + ITEMP = IWORK( LSTICC-1 ) +* +* Compute the 2-norm explicilty for the last difficult column and +* save it in the partial and exact 2-norm vectors VN1 and VN2. +* +* NOTE: The computation of VN1( LSTICC ) relies on the fact that +* DNRM2 does not fail on vectors with norm below the value of +* SQRT(DLAMCH('S')) +* + VN1( LSTICC ) = DNRM2( M-IF, A( IF+1, LSTICC ), 1 ) + VN2( LSTICC ) = VN1( LSTICC ) +* +* Downdate the index of the last difficult column to +* the index of the previous difficult column. +* + LSTICC = ITEMP +* + END DO +* + RETURN +* +* End of DLAQP3RK +* + END diff --git a/lapack-netlib/SRC/dlaqz0.f b/lapack-netlib/SRC/dlaqz0.f index c4cb95fd3..84cb96bcb 100644 --- a/lapack-netlib/SRC/dlaqz0.f +++ b/lapack-netlib/SRC/dlaqz0.f @@ -102,7 +102,7 @@ *> Anal., 29(2006), pp. 199--227. *> *> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift, -*> multipole rational QZ method with agressive early deflation" +*> multipole rational QZ method with aggressive early deflation" *> \endverbatim * * Arguments: @@ -332,7 +332,7 @@ CHARACTER :: JBCMPZ*3 * External Functions - EXTERNAL :: XERBLA, DHGEQZ, DLASET, DLAQZ3, DLAQZ4, DLABAD, + EXTERNAL :: XERBLA, DHGEQZ, DLASET, DLAQZ3, DLAQZ4, $ DLARTG, DROT DOUBLE PRECISION, EXTERNAL :: DLAMCH, DLANHS LOGICAL, EXTERNAL :: LSAME @@ -482,7 +482,6 @@ * Get machine constants SAFMIN = DLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE/SAFMIN - CALL DLABAD( SAFMIN, SAFMAX ) ULP = DLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( DBLE( N )/ULP ) @@ -567,7 +566,7 @@ DO WHILE ( K.GE.ISTART2 ) IF( ABS( B( K, K ) ) .LT. BTOL ) THEN -* A diagonal element of B is negligable, move it +* A diagonal element of B is negligible, move it * to the top and deflate it DO K2 = K, ISTART2+1, -1 diff --git a/lapack-netlib/SRC/dlarfb_gett.f b/lapack-netlib/SRC/dlarfb_gett.f index 10ab6461e..2c7ea59b0 100644 --- a/lapack-netlib/SRC/dlarfb_gett.f +++ b/lapack-netlib/SRC/dlarfb_gett.f @@ -451,7 +451,7 @@ IF( LNOTIDENT ) THEN * * col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2, -* V1 is not an identy matrix, but unit lower-triangular +* V1 is not an identity matrix, but unit lower-triangular * V1 stored in A1 (diagonal ones are not stored). * * diff --git a/lapack-netlib/SRC/dlarfgp.f b/lapack-netlib/SRC/dlarfgp.f index 69845056d..a8cf1b31e 100644 --- a/lapack-netlib/SRC/dlarfgp.f +++ b/lapack-netlib/SRC/dlarfgp.f @@ -97,7 +97,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup larfgp * * ===================================================================== SUBROUTINE DLARFGP( N, ALPHA, X, INCX, TAU ) @@ -122,7 +122,7 @@ * .. * .. Local Scalars .. INTEGER J, KNT - DOUBLE PRECISION BETA, BIGNUM, SAVEALPHA, SMLNUM, XNORM + DOUBLE PRECISION BETA, BIGNUM, EPS, SAVEALPHA, SMLNUM, XNORM * .. * .. External Functions .. DOUBLE PRECISION DLAMCH, DLAPY2, DNRM2 @@ -141,11 +141,12 @@ RETURN END IF * + EPS = DLAMCH( 'Precision' ) XNORM = DNRM2( N-1, X, INCX ) * - IF( XNORM.EQ.ZERO ) THEN + IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN * -* H = [+/-1, 0; I], sign chosen so ALPHA >= 0 +* H = [+/-1, 0; I], sign chosen so ALPHA >= 0. * IF( ALPHA.GE.ZERO ) THEN * When TAU.eq.ZERO, the vector is special-cased to be diff --git a/lapack-netlib/SRC/dlarre.f b/lapack-netlib/SRC/dlarre.f index 70f59b829..1cceed1a2 100644 --- a/lapack-netlib/SRC/dlarre.f +++ b/lapack-netlib/SRC/dlarre.f @@ -51,7 +51,7 @@ *> DSTEMR to compute the eigenvectors of T. *> The accuracy varies depending on whether bisection is used to *> find a few eigenvalues or the dqds algorithm (subroutine DLASQ2) to -*> conpute all and then discard any unwanted one. +*> compute all and then discard any unwanted one. *> As an added benefit, DLARRE also outputs the n *> Gerschgorin intervals for the matrices L_i D_i L_i^T. *> \endverbatim diff --git a/lapack-netlib/SRC/dlaruv.f b/lapack-netlib/SRC/dlaruv.f index 0f5c9541d..1a4ce4009 100644 --- a/lapack-netlib/SRC/dlaruv.f +++ b/lapack-netlib/SRC/dlaruv.f @@ -382,6 +382,11 @@ $ 1537 / * .. * .. Executable Statements .. +* +* Quick return for N < 1 + IF ( N < 1 ) THEN + RETURN + END IF * I1 = ISEED( 1 ) I2 = ISEED( 2 ) diff --git a/lapack-netlib/SRC/dlas2.f b/lapack-netlib/SRC/dlas2.f index ea929e86f..ff75e3942 100644 --- a/lapack-netlib/SRC/dlas2.f +++ b/lapack-netlib/SRC/dlas2.f @@ -93,9 +93,7 @@ *> infinite. *> *> Overflow will not occur unless the largest singular value itself -*> overflows, or is within a few ulps of overflow. (On machines with -*> partial overflow, like the Cray, overflow may occur if the largest -*> singular value is within a factor of 2 of overflow.) +*> overflows, or is within a few ulps of overflow. *> *> Underflow is harmless if underflow is gradual. Otherwise, results *> may correspond to a matrix modified by perturbations of size near diff --git a/lapack-netlib/SRC/dlasd0.f b/lapack-netlib/SRC/dlasd0.f index 215dc8717..6d8d4e2db 100644 --- a/lapack-netlib/SRC/dlasd0.f +++ b/lapack-netlib/SRC/dlasd0.f @@ -79,10 +79,11 @@ *> On exit, E has been destroyed. *> \endverbatim *> -*> \param[out] U +*> \param[in,out] U *> \verbatim *> U is DOUBLE PRECISION array, dimension (LDU, N) -*> On exit, U contains the left singular vectors. +*> On exit, U contains the left singular vectors, +*> if U passed in as (N, N) Identity. *> \endverbatim *> *> \param[in] LDU @@ -91,10 +92,11 @@ *> On entry, leading dimension of U. *> \endverbatim *> -*> \param[out] VT +*> \param[in,out] VT *> \verbatim *> VT is DOUBLE PRECISION array, dimension (LDVT, M) -*> On exit, VT**T contains the right singular vectors. +*> On exit, VT**T contains the right singular vectors, +*> if VT passed in as (M, M) Identity. *> \endverbatim *> *> \param[in] LDVT diff --git a/lapack-netlib/SRC/dlasd3.f b/lapack-netlib/SRC/dlasd3.f index df939efc5..44957377b 100644 --- a/lapack-netlib/SRC/dlasd3.f +++ b/lapack-netlib/SRC/dlasd3.f @@ -44,13 +44,6 @@ *> appropriate calls to DLASD4 and then updates the singular *> vectors by matrix multiplication. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. -*> *> DLASD3 is called from DLASD1. *> \endverbatim * @@ -103,7 +96,7 @@ *> The leading dimension of the array Q. LDQ >= K. *> \endverbatim *> -*> \param[in,out] DSIGMA +*> \param[in] DSIGMA *> \verbatim *> DSIGMA is DOUBLE PRECISION array, dimension(K) *> The first K elements of this array contain the old roots @@ -249,8 +242,8 @@ DOUBLE PRECISION RHO, TEMP * .. * .. External Functions .. - DOUBLE PRECISION DLAMC3, DNRM2 - EXTERNAL DLAMC3, DNRM2 + DOUBLE PRECISION DNRM2 + EXTERNAL DNRM2 * .. * .. External Subroutines .. EXTERNAL DCOPY, DGEMM, DLACPY, DLASCL, DLASD4, XERBLA @@ -310,27 +303,6 @@ RETURN END IF * -* Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I), -* which on any of these machines zeros out the bottommost -* bit of DSIGMA(I) if it is 1; this makes the subsequent -* subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DSIGMA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DSIGMA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DSIGMA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 20 I = 1, K - DSIGMA( I ) = DLAMC3( DSIGMA( I ), DSIGMA( I ) ) - DSIGMA( I ) - 20 CONTINUE -* * Keep a copy of Z. * CALL DCOPY( K, Z, 1, Q, 1 ) diff --git a/lapack-netlib/SRC/dlasd8.f b/lapack-netlib/SRC/dlasd8.f index a769bdb22..73c3ef0b4 100644 --- a/lapack-netlib/SRC/dlasd8.f +++ b/lapack-netlib/SRC/dlasd8.f @@ -121,14 +121,12 @@ *> The leading dimension of DIFR, must be at least K. *> \endverbatim *> -*> \param[in,out] DSIGMA +*> \param[in] DSIGMA *> \verbatim *> DSIGMA is DOUBLE PRECISION array, dimension ( K ) *> On entry, the first K elements of this array contain the old *> roots of the deflated updating problem. These are the poles *> of the secular equation. -*> On exit, the elements of DSIGMA may be very slightly altered -*> in value. *> \endverbatim *> *> \param[out] WORK @@ -227,27 +225,6 @@ RETURN END IF * -* Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I), -* which on any of these machines zeros out the bottommost -* bit of DSIGMA(I) if it is 1; this makes the subsequent -* subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DSIGMA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DSIGMA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DLAMBDA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 10 I = 1, K - DSIGMA( I ) = DLAMC3( DSIGMA( I ), DSIGMA( I ) ) - DSIGMA( I ) - 10 CONTINUE -* * Book keeping. * IWK1 = 1 @@ -312,6 +289,11 @@ DSIGJP = -DSIGMA( J+1 ) END IF WORK( J ) = -Z( J ) / DIFLJ / ( DSIGMA( J )+DJ ) +* +* Use calls to the subroutine DLAMC3 to enforce the parentheses +* (x+y)+z. The goal is to prevent optimizing compilers +* from doing x+(y+z). +* DO 60 I = 1, J - 1 WORK( I ) = Z( I ) / ( DLAMC3( DSIGMA( I ), DSIGJ )-DIFLJ ) $ / ( DSIGMA( I )+DJ ) diff --git a/lapack-netlib/SRC/dlassq.f90 b/lapack-netlib/SRC/dlassq.f90 index fddd1bf38..37626844b 100644 --- a/lapack-netlib/SRC/dlassq.f90 +++ b/lapack-netlib/SRC/dlassq.f90 @@ -34,28 +34,15 @@ !> !> \verbatim !> -!> DLASSQ returns the values scl and smsq such that +!> DLASSQ returns the values scale_out and sumsq_out such that !> -!> ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq, +!> (scale_out**2)*sumsq_out = x( 1 )**2 +...+ x( n )**2 + (scale**2)*sumsq, !> -!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is +!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is !> assumed to be non-negative. !> !> scale and sumsq must be supplied in SCALE and SUMSQ and -!> scl and smsq are overwritten on SCALE and SUMSQ respectively. -!> -!> If scale * sqrt( sumsq ) > tbig then -!> we require: scale >= sqrt( TINY*EPS ) / sbig on entry, -!> and if 0 < scale * sqrt( sumsq ) < tsml then -!> we require: scale <= sqrt( HUGE ) / ssml on entry, -!> where -!> tbig -- upper threshold for values whose square is representable; -!> sbig -- scaling constant for big numbers; \see la_constants.f90 -!> tsml -- lower threshold for values whose square is representable; -!> ssml -- scaling constant for small numbers; \see la_constants.f90 -!> and -!> TINY*EPS -- tiniest representable number; -!> HUGE -- biggest representable number. +!> scale_out and sumsq_out are overwritten on SCALE and SUMSQ respectively. !> !> \endverbatim ! @@ -72,7 +59,7 @@ !> \verbatim !> X is DOUBLE PRECISION array, dimension (1+(N-1)*abs(INCX)) !> The vector for which a scaled sum of squares is computed. -!> x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n. +!> x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n. !> \endverbatim !> !> \param[in] INCX @@ -82,24 +69,24 @@ !> If INCX > 0, X(1+(i-1)*INCX) = x(i) for 1 <= i <= n !> If INCX < 0, X(1-(n-i)*INCX) = x(i) for 1 <= i <= n !> If INCX = 0, x isn't a vector so there is no need to call -!> this subroutine. If you call it anyway, it will count x(1) +!> this subroutine. If you call it anyway, it will count x(1) !> in the vector norm N times. !> \endverbatim !> !> \param[in,out] SCALE !> \verbatim !> SCALE is DOUBLE PRECISION -!> On entry, the value scale in the equation above. -!> On exit, SCALE is overwritten with scl , the scaling factor +!> On entry, the value scale in the equation above. +!> On exit, SCALE is overwritten by scale_out, the scaling factor !> for the sum of squares. !> \endverbatim !> !> \param[in,out] SUMSQ !> \verbatim !> SUMSQ is DOUBLE PRECISION -!> On entry, the value sumsq in the equation above. -!> On exit, SUMSQ is overwritten with smsq , the basic sum of -!> squares from which scl has been factored out. +!> On entry, the value sumsq in the equation above. +!> On exit, SUMSQ is overwritten by sumsq_out, the basic sum of +!> squares from which scale_out has been factored out. !> \endverbatim ! ! Authors: @@ -130,10 +117,10 @@ !> !> \endverbatim ! -!> \ingroup OTHERauxiliary +!> \ingroup lassq ! ! ===================================================================== -subroutine DLASSQ( n, x, incx, scl, sumsq ) +subroutine DLASSQ( n, x, incx, scale, sumsq ) use LA_CONSTANTS, & only: wp=>dp, zero=>dzero, one=>done, & sbig=>dsbig, ssml=>dssml, tbig=>dtbig, tsml=>dtsml @@ -145,7 +132,7 @@ subroutine DLASSQ( n, x, incx, scl, sumsq ) ! ! .. Scalar Arguments .. integer :: incx, n - real(wp) :: scl, sumsq + real(wp) :: scale, sumsq ! .. ! .. Array Arguments .. real(wp) :: x(*) @@ -158,10 +145,10 @@ subroutine DLASSQ( n, x, incx, scl, sumsq ) ! ! Quick return if possible ! - if( LA_ISNAN(scl) .or. LA_ISNAN(sumsq) ) return - if( sumsq == zero ) scl = one - if( scl == zero ) then - scl = one + if( LA_ISNAN(scale) .or. LA_ISNAN(sumsq) ) return + if( sumsq == zero ) scale = one + if( scale == zero ) then + scale = one sumsq = zero end if if (n <= 0) then @@ -198,15 +185,27 @@ subroutine DLASSQ( n, x, incx, scl, sumsq ) ! Put the existing sum of squares into one of the accumulators ! if( sumsq > zero ) then - ax = scl*sqrt( sumsq ) + ax = scale*sqrt( sumsq ) if (ax > tbig) then -! We assume scl >= sqrt( TINY*EPS ) / sbig - abig = abig + (scl*sbig)**2 * sumsq + if (scale > one) then + scale = scale * sbig + abig = abig + scale * (scale * sumsq) + else + ! sumsq > tbig^2 => (sbig * (sbig * sumsq)) is representable + abig = abig + scale * (scale * (sbig * (sbig * sumsq))) + end if else if (ax < tsml) then -! We assume scl <= sqrt( HUGE ) / ssml - if (notbig) asml = asml + (scl*ssml)**2 * sumsq + if (notbig) then + if (scale < one) then + scale = scale * ssml + asml = asml + scale * (scale * sumsq) + else + ! sumsq < tsml^2 => (ssml * (ssml * sumsq)) is representable + asml = asml + scale * (scale * (ssml * (ssml * sumsq))) + end if + end if else - amed = amed + scl**2 * sumsq + amed = amed + scale * (scale * sumsq) end if end if ! @@ -220,7 +219,7 @@ subroutine DLASSQ( n, x, incx, scl, sumsq ) if (amed > zero .or. LA_ISNAN(amed)) then abig = abig + (amed*sbig)*sbig end if - scl = one / sbig + scale = one / sbig sumsq = abig else if (asml > zero) then ! @@ -236,17 +235,17 @@ subroutine DLASSQ( n, x, incx, scl, sumsq ) ymin = asml ymax = amed end if - scl = one + scale = one sumsq = ymax**2*( one + (ymin/ymax)**2 ) else - scl = one / ssml + scale = one / ssml sumsq = asml end if else ! ! Otherwise all values are mid-range or zero ! - scl = one + scale = one sumsq = amed end if return diff --git a/lapack-netlib/SRC/dlasv2.f b/lapack-netlib/SRC/dlasv2.f index 64a06dee1..cb2bf51c4 100644 --- a/lapack-netlib/SRC/dlasv2.f +++ b/lapack-netlib/SRC/dlasv2.f @@ -124,9 +124,7 @@ *> infinite. *> *> Overflow will not occur unless the largest singular value itself -*> overflows or is within a few ulps of overflow. (On machines with -*> partial overflow, like the Cray, overflow may occur if the largest -*> singular value is within a factor of 2 of overflow.) +*> overflows or is within a few ulps of overflow. *> *> Underflow is harmless if underflow is gradual. Otherwise, results *> may correspond to a matrix modified by perturbations of size near diff --git a/lapack-netlib/SRC/dlaswlq.f b/lapack-netlib/SRC/dlaswlq.f index fb8857145..636c12dc8 100644 --- a/lapack-netlib/SRC/dlaswlq.f +++ b/lapack-netlib/SRC/dlaswlq.f @@ -99,18 +99,22 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) -*> +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim -*> The dimension of the array WORK. LWORK >= MB*M. +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -158,33 +162,37 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup laswlq +*> * ===================================================================== SUBROUTINE DLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), T( LDT, *) + DOUBLE PRECISION A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL DGELQT, DTPLQT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -195,12 +203,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LT.0 ) THEN INFO = -4 @@ -208,60 +223,62 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = MB*M +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN - CALL DGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) + IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN + CALL DGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO ) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL DGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL DGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M, (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL DTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), - $ LDA, T(1, CTR * M + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL DTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + $ LDA, T(1, CTR * M + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL DTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), - $ LDA, T(1, CTR * M + 1), LDT, - $ WORK, INFO ) - END IF + $ LDA, T(1, CTR * M + 1), LDT, + $ WORK, INFO ) + END IF +* + WORK( 1 ) = LWMIN * - WORK( 1 ) = M * MB RETURN * * End of DLASWLQ diff --git a/lapack-netlib/SRC/dlatrs.f b/lapack-netlib/SRC/dlatrs.f index be156bee2..b282f4227 100644 --- a/lapack-netlib/SRC/dlatrs.f +++ b/lapack-netlib/SRC/dlatrs.f @@ -261,6 +261,9 @@ DOUBLE PRECISION BIGNUM, GROW, REC, SMLNUM, SUMJ, TJJ, TJJS, $ TMAX, TSCAL, USCAL, XBND, XJ, XMAX * .. +* .. Local Arrays .. + DOUBLE PRECISION WORK(1) +* .. * .. External Functions .. LOGICAL LSAME INTEGER IDAMAX @@ -362,7 +365,7 @@ * A is upper triangular. * DO J = 2, N - TMAX = MAX( DLANGE( 'M', J-1, 1, A( 1, J ), 1, SUMJ ), + TMAX = MAX( DLANGE( 'M', J-1, 1, A( 1, J ), 1, WORK ), $ TMAX ) END DO ELSE @@ -371,7 +374,7 @@ * DO J = 1, N - 1 TMAX = MAX( DLANGE( 'M', N-J, 1, A( J+1, J ), 1, - $ SUMJ ), TMAX ) + $ WORK ), TMAX ) END DO END IF * diff --git a/lapack-netlib/SRC/dlatrs3.f b/lapack-netlib/SRC/dlatrs3.f index b4a98bc78..d18675b2d 100644 --- a/lapack-netlib/SRC/dlatrs3.f +++ b/lapack-netlib/SRC/dlatrs3.f @@ -151,13 +151,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK). +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal size of *> WORK. *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -165,6 +169,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -181,7 +186,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -253,7 +258,7 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. @@ -292,15 +297,24 @@ * row. WORK( I+KK*LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). +* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = LWMIN * * Test the input parameters * @@ -322,7 +336,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN @@ -574,7 +588,7 @@ * Prepare the linear update to be executed with GEMM. * For each column, compute a consistent scaling, a * scaling factor to survive the linear update, and -* rescale the column segments, if necesssary. Then +* rescale the column segments, if necessary. Then * the linear update is safely executed. * DO KK = 1, K2-K1 @@ -649,6 +663,9 @@ END IF END DO END DO +* + WORK( 1 ) = LWMIN +* RETURN * * End of DLATRS3 diff --git a/lapack-netlib/SRC/dlatsqr.f b/lapack-netlib/SRC/dlatsqr.f index f5cbb76bb..0000aab68 100644 --- a/lapack-netlib/SRC/dlatsqr.f +++ b/lapack-netlib/SRC/dlatsqr.f @@ -101,14 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim -*> The dimension of the array WORK. LWORK >= NB*N. +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -160,27 +164,29 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup latsqr +*> * ===================================================================== SUBROUTINE DLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), T(LDT, *) + DOUBLE PRECISION A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME @@ -197,6 +203,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -204,65 +217,67 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = NB*N +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF +* IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL DGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF + IF( (MB.LE.N) .OR. (MB.GE.M) ) THEN + CALL DGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF * - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL DGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CALL DGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) * - CTR = 1 - DO I = MB+1, II-MB+N , (MB-N) + CTR = 1 + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL DTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, - $ T(1, CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL DTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + $ T(1, CTR * N + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL DTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, - $ T(1, CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + IF( II.LE.M ) THEN + CALL DTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + $ T(1, CTR * N + 1), LDT, + $ WORK, INFO ) + END IF * - WORK( 1 ) = N*NB + WORK( 1 ) = LWMIN RETURN * * End of DLATSQR diff --git a/lapack-netlib/SRC/dorbdb1.f b/lapack-netlib/SRC/dorbdb1.f index dae18f5df..b5b2d1362 100644 --- a/lapack-netlib/SRC/dorbdb1.f +++ b/lapack-netlib/SRC/dorbdb1.f @@ -37,7 +37,7 @@ *>\verbatim *> *> DORBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/dorbdb2.f b/lapack-netlib/SRC/dorbdb2.f index a0dacbb16..0b4ad732c 100644 --- a/lapack-netlib/SRC/dorbdb2.f +++ b/lapack-netlib/SRC/dorbdb2.f @@ -37,7 +37,7 @@ *>\verbatim *> *> DORBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/dorbdb3.f b/lapack-netlib/SRC/dorbdb3.f index 7b3727956..79b10a5d3 100644 --- a/lapack-netlib/SRC/dorbdb3.f +++ b/lapack-netlib/SRC/dorbdb3.f @@ -37,7 +37,7 @@ *>\verbatim *> *> DORBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/dorbdb4.f b/lapack-netlib/SRC/dorbdb4.f index 08604be45..985be3277 100644 --- a/lapack-netlib/SRC/dorbdb4.f +++ b/lapack-netlib/SRC/dorbdb4.f @@ -38,7 +38,7 @@ *>\verbatim *> *> DORBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/dorbdb5.f b/lapack-netlib/SRC/dorbdb5.f index 6e057a05f..cbd58ae54 100644 --- a/lapack-netlib/SRC/dorbdb5.f +++ b/lapack-netlib/SRC/dorbdb5.f @@ -148,7 +148,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup unbdb5 * * ===================================================================== SUBROUTINE DORBDB5( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, @@ -169,18 +169,21 @@ * ===================================================================== * * .. Parameters .. + DOUBLE PRECISION REALZERO + PARAMETER ( REALZERO = 0.0D0 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D0, ZERO = 0.0D0 ) * .. * .. Local Scalars .. INTEGER CHILDINFO, I, J + DOUBLE PRECISION EPS, NORM, SCL, SSQ * .. * .. External Subroutines .. - EXTERNAL DORBDB6, XERBLA + EXTERNAL DLASSQ, DORBDB6, DSCAL, XERBLA * .. * .. External Functions .. - DOUBLE PRECISION DNRM2 - EXTERNAL DNRM2 + DOUBLE PRECISION DLAMCH, DNRM2 + EXTERNAL DLAMCH, DNRM2 * .. * .. Intrinsic Function .. INTRINSIC MAX @@ -213,16 +216,33 @@ RETURN END IF * -* Project X onto the orthogonal complement of Q + EPS = DLAMCH( 'Precision' ) * - CALL DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, LDQ2, - $ WORK, LWORK, CHILDINFO ) +* Project X onto the orthogonal complement of Q if X is nonzero * -* If the projection is nonzero, then return + SCL = REALZERO + SSQ = REALZERO + CALL DLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL DLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM = SCL * SQRT( SSQ ) * - IF( DNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. DNRM2(M2,X2,INCX2) .NE. ZERO ) THEN - RETURN + IF( NORM .GT. N * EPS ) THEN +* Scale vector to unit norm to avoid problems in the caller code. +* Computing the reciprocal is undesirable but +* * xLASCL cannot be used because of the vector increments and +* * the round-off error has a negligible impact on +* orthogonalization. + CALL DSCAL( M1, ONE / NORM, X1, INCX1 ) + CALL DSCAL( M2, ONE / NORM, X2, INCX2 ) + CALL DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, + $ LDQ2, WORK, LWORK, CHILDINFO ) +* +* If the projection is nonzero, then return +* + IF( DNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. DNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN + RETURN + END IF END IF * * Project each standard basis vector e_1,...,e_M1 in turn, stopping @@ -238,8 +258,8 @@ END DO CALL DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, $ LDQ2, WORK, LWORK, CHILDINFO ) - IF( DNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. DNRM2(M2,X2,INCX2) .NE. ZERO ) THEN + IF( DNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. DNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN RETURN END IF END DO @@ -257,8 +277,8 @@ X2(I) = ONE CALL DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, $ LDQ2, WORK, LWORK, CHILDINFO ) - IF( DNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. DNRM2(M2,X2,INCX2) .NE. ZERO ) THEN + IF( DNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. DNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN RETURN END IF END DO diff --git a/lapack-netlib/SRC/dorbdb6.f b/lapack-netlib/SRC/dorbdb6.f index 45c8ba8a2..3e356d001 100644 --- a/lapack-netlib/SRC/dorbdb6.f +++ b/lapack-netlib/SRC/dorbdb6.f @@ -41,9 +41,8 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The Euclidean norm of X must be one and the columns of Q must be -*> orthonormal. The orthogonalized vector will be zero if and only if it -*> lies entirely in the range of Q. +*> The columns of Q must be orthonormal. The orthogonalized vector will +*> be zero if and only if it lies entirely in the range of Q. *> *> The projection is computed with at most two iterations of the *> classical Gram-Schmidt algorithm, see @@ -152,7 +151,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup unbdb6 * * ===================================================================== SUBROUTINE DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, @@ -174,7 +173,7 @@ * * .. Parameters .. DOUBLE PRECISION ALPHA, REALONE, REALZERO - PARAMETER ( ALPHA = 0.01D0, REALONE = 1.0D0, + PARAMETER ( ALPHA = 0.83D0, REALONE = 1.0D0, $ REALZERO = 0.0D0 ) DOUBLE PRECISION NEGONE, ONE, ZERO PARAMETER ( NEGONE = -1.0D0, ONE = 1.0D0, ZERO = 0.0D0 ) @@ -222,14 +221,16 @@ * EPS = DLAMCH( 'Precision' ) * +* Compute the Euclidean norm of X +* + SCL = REALZERO + SSQ = REALZERO + CALL DLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL DLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM = SCL * SQRT( SSQ ) +* * First, project X onto the orthogonal complement of Q's column * space -* -* Christoph Conrads: In debugging mode the norm should be computed -* and an assertion added comparing the norm with one. Alas, Fortran -* never made it into 1989 when assert() was introduced into the C -* programming language. - NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N diff --git a/lapack-netlib/SRC/dorgtsqr.f b/lapack-netlib/SRC/dorgtsqr.f index 34d96b238..2e5cd40be 100644 --- a/lapack-netlib/SRC/dorgtsqr.f +++ b/lapack-netlib/SRC/dorgtsqr.f @@ -133,6 +133,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= (M+NB)*N. *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK @@ -301,4 +302,4 @@ * * End of DORGTSQR * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/dpbsv.f b/lapack-netlib/SRC/dpbsv.f index a52e78309..2d8f06440 100644 --- a/lapack-netlib/SRC/dpbsv.f +++ b/lapack-netlib/SRC/dpbsv.f @@ -119,9 +119,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/dpbsvx.f b/lapack-netlib/SRC/dpbsvx.f index 1bf526fc7..142dda5da 100644 --- a/lapack-netlib/SRC/dpbsvx.f +++ b/lapack-netlib/SRC/dpbsvx.f @@ -71,7 +71,7 @@ *> where U is an upper triangular band matrix, and L is a lower *> triangular band matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -281,10 +281,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/dpbtf2.f b/lapack-netlib/SRC/dpbtf2.f index 534629294..1284c9ec8 100644 --- a/lapack-netlib/SRC/dpbtf2.f +++ b/lapack-netlib/SRC/dpbtf2.f @@ -97,8 +97,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/dpbtrf.f b/lapack-netlib/SRC/dpbtrf.f index 8256f8938..29e9aaecd 100644 --- a/lapack-netlib/SRC/dpbtrf.f +++ b/lapack-netlib/SRC/dpbtrf.f @@ -92,8 +92,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/dpftrf.f b/lapack-netlib/SRC/dpftrf.f index 980debaf2..312dcdf84 100644 --- a/lapack-netlib/SRC/dpftrf.f +++ b/lapack-netlib/SRC/dpftrf.f @@ -91,8 +91,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/dposv.f b/lapack-netlib/SRC/dposv.f index ee2988e6f..cb76e9977 100644 --- a/lapack-netlib/SRC/dposv.f +++ b/lapack-netlib/SRC/dposv.f @@ -110,9 +110,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/dposvx.f b/lapack-netlib/SRC/dposvx.f index 4a0b9d605..faffff803 100644 --- a/lapack-netlib/SRC/dposvx.f +++ b/lapack-netlib/SRC/dposvx.f @@ -71,7 +71,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -277,10 +277,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/dposvxx.f b/lapack-netlib/SRC/dposvxx.f index e74b23c80..b5336cf63 100644 --- a/lapack-netlib/SRC/dposvxx.f +++ b/lapack-netlib/SRC/dposvxx.f @@ -88,7 +88,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A (see argument RCOND). If the reciprocal of the condition number diff --git a/lapack-netlib/SRC/dpotf2.f b/lapack-netlib/SRC/dpotf2.f index 08fa4957f..30da5c3f3 100644 --- a/lapack-netlib/SRC/dpotf2.f +++ b/lapack-netlib/SRC/dpotf2.f @@ -89,8 +89,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/dpotrf.f b/lapack-netlib/SRC/dpotrf.f index 1679fc3cd..65509feb8 100644 --- a/lapack-netlib/SRC/dpotrf.f +++ b/lapack-netlib/SRC/dpotrf.f @@ -87,8 +87,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/dpotrf2.f b/lapack-netlib/SRC/dpotrf2.f index 6c28ce6d6..aaf9b9c58 100644 --- a/lapack-netlib/SRC/dpotrf2.f +++ b/lapack-netlib/SRC/dpotrf2.f @@ -86,8 +86,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/dppsv.f b/lapack-netlib/SRC/dppsv.f index 435703b08..1888005d9 100644 --- a/lapack-netlib/SRC/dppsv.f +++ b/lapack-netlib/SRC/dppsv.f @@ -104,9 +104,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/dppsvx.f b/lapack-netlib/SRC/dppsvx.f index cb41d39ee..3b08fc821 100644 --- a/lapack-netlib/SRC/dppsvx.f +++ b/lapack-netlib/SRC/dppsvx.f @@ -69,7 +69,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -262,10 +262,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/dpptrf.f b/lapack-netlib/SRC/dpptrf.f index 2d8de5110..d9eed910d 100644 --- a/lapack-netlib/SRC/dpptrf.f +++ b/lapack-netlib/SRC/dpptrf.f @@ -79,8 +79,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/dpteqr.f b/lapack-netlib/SRC/dpteqr.f index aa1f1a80c..d07b065b0 100644 --- a/lapack-netlib/SRC/dpteqr.f +++ b/lapack-netlib/SRC/dpteqr.f @@ -123,8 +123,8 @@ *> < 0: if INFO = -i, the i-th argument had an illegal value. *> > 0: if INFO = i, and i is: *> <= N the Cholesky factorization of the matrix could -*> not be performed because the i-th principal minor -*> was not positive definite. +*> not be performed because the leading principal +*> minor of order i was not positive. *> > N the SVD algorithm failed to converge; *> if INFO = N+i, i off-diagonal elements of the *> bidiagonal factor did not converge to zero. diff --git a/lapack-netlib/SRC/dptsv.f b/lapack-netlib/SRC/dptsv.f index addc34b88..41d8cff15 100644 --- a/lapack-netlib/SRC/dptsv.f +++ b/lapack-netlib/SRC/dptsv.f @@ -93,8 +93,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the solution has not been +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the solution has not been *> computed. The factorization has not been completed *> unless i = N. *> \endverbatim diff --git a/lapack-netlib/SRC/dptsvx.f b/lapack-netlib/SRC/dptsvx.f index 7fb6cf436..fcbf5aa3b 100644 --- a/lapack-netlib/SRC/dptsvx.f +++ b/lapack-netlib/SRC/dptsvx.f @@ -59,7 +59,7 @@ *> factorization can also be regarded as having the form *> A = U**T*D*U. *> -*> 2. If the leading i-by-i principal minor is not positive definite, +*> 2. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -199,10 +199,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/dpttrf.f b/lapack-netlib/SRC/dpttrf.f index e0022e3ad..156e845a3 100644 --- a/lapack-netlib/SRC/dpttrf.f +++ b/lapack-netlib/SRC/dpttrf.f @@ -70,8 +70,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite; if k < N, the factorization could not +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive; if k < N, the factorization could not *> be completed, while if k = N, the factorization was *> completed, but D(N) <= 0. *> \endverbatim diff --git a/lapack-netlib/SRC/dsbevd.f b/lapack-netlib/SRC/dsbevd.f index 3eb4ed8df..350c0a9f0 100644 --- a/lapack-netlib/SRC/dsbevd.f +++ b/lapack-netlib/SRC/dsbevd.f @@ -40,12 +40,6 @@ *> a real symmetric band matrix A. If eigenvectors are desired, it uses *> a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dsbevd_2stage.f b/lapack-netlib/SRC/dsbevd_2stage.f index 45a64b478..82997c850 100644 --- a/lapack-netlib/SRC/dsbevd_2stage.f +++ b/lapack-netlib/SRC/dsbevd_2stage.f @@ -45,12 +45,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it uses *> a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dsbgvd.f b/lapack-netlib/SRC/dsbgvd.f index 30b016611..0ab3177ac 100644 --- a/lapack-netlib/SRC/dsbgvd.f +++ b/lapack-netlib/SRC/dsbgvd.f @@ -43,12 +43,6 @@ *> banded, and B is also positive definite. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dsbgvx.f b/lapack-netlib/SRC/dsbgvx.f index 20de17931..53deae580 100644 --- a/lapack-netlib/SRC/dsbgvx.f +++ b/lapack-netlib/SRC/dsbgvx.f @@ -267,7 +267,7 @@ *> Their indices are stored in IFAIL. *> > N: DPBSTF returned an error code; i.e., *> if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dspevd.f b/lapack-netlib/SRC/dspevd.f index d9d6c8917..05aa91b03 100644 --- a/lapack-netlib/SRC/dspevd.f +++ b/lapack-netlib/SRC/dspevd.f @@ -40,12 +40,6 @@ *> of a real symmetric matrix A in packed storage. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dspgv.f b/lapack-netlib/SRC/dspgv.f index d8ec3b1a4..96041c301 100644 --- a/lapack-netlib/SRC/dspgv.f +++ b/lapack-netlib/SRC/dspgv.f @@ -139,7 +139,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero. *> > N: if INFO = n + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dspgvd.f b/lapack-netlib/SRC/dspgvd.f index df215ae1a..24c2309c3 100644 --- a/lapack-netlib/SRC/dspgvd.f +++ b/lapack-netlib/SRC/dspgvd.f @@ -44,12 +44,6 @@ *> positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -184,7 +178,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dspgvx.f b/lapack-netlib/SRC/dspgvx.f index ec93147aa..5afd73d02 100644 --- a/lapack-netlib/SRC/dspgvx.f +++ b/lapack-netlib/SRC/dspgvx.f @@ -245,7 +245,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dsposv.f b/lapack-netlib/SRC/dsposv.f index c3b8de0e3..0bd75698d 100644 --- a/lapack-netlib/SRC/dsposv.f +++ b/lapack-netlib/SRC/dsposv.f @@ -177,8 +177,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of (DOUBLE -*> PRECISION) A is not positive definite, so the +*> > 0: if INFO = i, the leading principal minor of order i +*> of (DOUBLE PRECISION) A is not positive, so the *> factorization could not be completed, and the solution *> has not been computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dstedc.f b/lapack-netlib/SRC/dstedc.f index 2ed84afaa..6d533664b 100644 --- a/lapack-netlib/SRC/dstedc.f +++ b/lapack-netlib/SRC/dstedc.f @@ -42,12 +42,6 @@ *> found if DSYTRD or DSPTRD or DSBTRD has been used to reduce this *> matrix to tridiagonal form. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. See DLAED3 for details. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dstegr.f b/lapack-netlib/SRC/dstegr.f index 01ec101d8..598c60e8d 100644 --- a/lapack-netlib/SRC/dstegr.f +++ b/lapack-netlib/SRC/dstegr.f @@ -56,7 +56,7 @@ *> *> Note : DSTEGR and DSTEMR work only on machines which follow *> IEEE-754 floating-point standard in their handling of infinities and -*> NaNs. Normal execution may create these exceptiona values and hence +*> NaNs. Normal execution may create these exceptional values and hence *> may abort due to a floating point exception in environments which *> do not conform to the IEEE-754 standard. *> \endverbatim diff --git a/lapack-netlib/SRC/dstemr.f b/lapack-netlib/SRC/dstemr.f index d0c71ddd9..44a33423e 100644 --- a/lapack-netlib/SRC/dstemr.f +++ b/lapack-netlib/SRC/dstemr.f @@ -303,7 +303,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup stemr * *> \par Contributors: * ================== @@ -312,7 +312,8 @@ *> Jim Demmel, University of California, Berkeley, USA \n *> Inderjit Dhillon, University of Texas, Austin, USA \n *> Osni Marques, LBNL/NERSC, USA \n -*> Christof Voemel, University of California, Berkeley, USA +*> Christof Voemel, University of California, Berkeley, USA \n +*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n * * ===================================================================== SUBROUTINE DSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, @@ -344,7 +345,8 @@ $ MINRGP = 1.0D-3 ) * .. * .. Local Scalars .. - LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY + LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY, + $ LAESWAP INTEGER I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW, $ IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD, $ INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP, @@ -380,6 +382,7 @@ * LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) ) ZQUERY = ( NZC.EQ.-1 ) + LAESWAP = .FALSE. * DSTEMR needs WORK of size 6*N, IWORK of size 3*N. * In addition, DLARRE needs WORK of size 6*N, IWORK of size 5*N. @@ -502,6 +505,15 @@ ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN CALL DLAEV2( D(1), E(1), D(2), R1, R2, CS, SN ) END IF +* D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However, +* the following code requires R1 >= R2. Hence, we correct +* the order of R1, R2, CS, SN if R1 < R2 before further processing. + IF( R1.LT.R2 ) THEN + E(2) = R1 + R1 = R2 + R2 = E(2) + LAESWAP = .TRUE. + ENDIF IF( ALLEIG.OR. $ (VALEIG.AND.(R2.GT.WL).AND. $ (R2.LE.WU)).OR. @@ -509,8 +521,13 @@ M = M+1 W( M ) = R2 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = -SN - Z( 2, M ) = CS + IF( LAESWAP ) THEN + Z( 1, M ) = CS + Z( 2, M ) = SN + ELSE + Z( 1, M ) = -SN + Z( 2, M ) = CS + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN @@ -533,8 +550,13 @@ M = M+1 W( M ) = R1 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = CS - Z( 2, M ) = SN + IF( LAESWAP ) THEN + Z( 1, M ) = -SN + Z( 2, M ) = CS + ELSE + Z( 1, M ) = CS + Z( 2, M ) = SN + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN diff --git a/lapack-netlib/SRC/dstevd.f b/lapack-netlib/SRC/dstevd.f index 507f39b2b..54717df3d 100644 --- a/lapack-netlib/SRC/dstevd.f +++ b/lapack-netlib/SRC/dstevd.f @@ -40,12 +40,6 @@ *> real symmetric tridiagonal matrix. If eigenvectors are desired, it *> uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dsyconvf.f b/lapack-netlib/SRC/dsyconvf.f index 9e7a5af0e..005c98a03 100644 --- a/lapack-netlib/SRC/dsyconvf.f +++ b/lapack-netlib/SRC/dsyconvf.f @@ -39,7 +39,7 @@ *> DSYTRF provided on entry in parameter A into the factorization *> output format used in DSYTRF_RK (or DSYTRF_BK) that is stored *> on exit in parameters A and E. It also converts in place details of -*> the intechanges stored in IPIV from the format used in DSYTRF into +*> the interchanges stored in IPIV from the format used in DSYTRF into *> the format used in DSYTRF_RK (or DSYTRF_BK). *> *> If parameter WAY = 'R': @@ -48,7 +48,7 @@ *> (or DSYTRF_BK) provided on entry in parameters A and E into *> the factorization output format used in DSYTRF that is stored *> on exit in parameter A. It also converts in place details of -*> the intechanges stored in IPIV from the format used in DSYTRF_RK +*> the interchanges stored in IPIV from the format used in DSYTRF_RK *> (or DSYTRF_BK) into the format used in DSYTRF. *> \endverbatim * @@ -322,7 +322,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -466,7 +466,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -532,7 +532,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/dsyconvf_rook.f b/lapack-netlib/SRC/dsyconvf_rook.f index d7f529358..c3f2083bd 100644 --- a/lapack-netlib/SRC/dsyconvf_rook.f +++ b/lapack-netlib/SRC/dsyconvf_rook.f @@ -517,7 +517,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/dsyev_2stage.f b/lapack-netlib/SRC/dsyev_2stage.f index 50d51d992..286366bfe 100644 --- a/lapack-netlib/SRC/dsyev_2stage.f +++ b/lapack-netlib/SRC/dsyev_2stage.f @@ -20,7 +20,7 @@ * Definition: * =========== * -* SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, +* SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, * INFO ) * * IMPLICIT NONE @@ -97,7 +97,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension LWORK +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -105,12 +105,12 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 2*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + 2*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -143,7 +143,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heev_2stage * *> \par Further Details: * ===================== @@ -161,7 +161,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -169,16 +169,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * * ===================================================================== - SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, + SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, $ INFO ) * IMPLICIT NONE @@ -305,7 +305,7 @@ LLWORK = LWORK - INDWRK + 1 * CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ), - $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, + $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, $ WORK( INDWRK ), LLWORK, IINFO ) * * For eigenvalues only, call DSTERF. For eigenvectors, first call diff --git a/lapack-netlib/SRC/dsyevd.f b/lapack-netlib/SRC/dsyevd.f index eaaecd8d9..adcfcb373 100644 --- a/lapack-netlib/SRC/dsyevd.f +++ b/lapack-netlib/SRC/dsyevd.f @@ -40,13 +40,6 @@ *> real symmetric matrix A. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. -*> *> Because of large use of BLAS of level 3, DSYEVD needs N**2 more *> workspace than DSYEVX. *> \endverbatim @@ -103,8 +96,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, -*> dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -167,7 +159,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevd * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/dsyevd_2stage.f b/lapack-netlib/SRC/dsyevd_2stage.f index 0eae8ad06..d5a68c35d 100644 --- a/lapack-netlib/SRC/dsyevd_2stage.f +++ b/lapack-netlib/SRC/dsyevd_2stage.f @@ -45,12 +45,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dsyevr.f b/lapack-netlib/SRC/dsyevr.f index 698691533..8647b0162 100644 --- a/lapack-netlib/SRC/dsyevr.f +++ b/lapack-netlib/SRC/dsyevr.f @@ -271,7 +271,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,26*N). +*> The dimension of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 26*N. *> For optimal efficiency, LWORK >= (NB+6)*N, *> where NB is the max of the blocksize for DSYTRD and DORMTR *> returned by ILAENV. @@ -285,13 +286,14 @@ *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) -*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK. +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. *> \endverbatim *> *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the IWORK array, @@ -315,7 +317,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevr * *> \par Contributors: * ================== @@ -390,8 +392,13 @@ * LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) ) * - LWMIN = MAX( 1, 26*N ) - LIWMIN = MAX( 1, 10*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 26*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -450,7 +457,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 7 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = A( 1, 1 ) diff --git a/lapack-netlib/SRC/dsyevr_2stage.f b/lapack-netlib/SRC/dsyevr_2stage.f index 09242bbd3..63d5e3159 100644 --- a/lapack-netlib/SRC/dsyevr_2stage.f +++ b/lapack-netlib/SRC/dsyevr_2stage.f @@ -263,7 +263,7 @@ *> indicating the nonzero elements in Z. The i-th eigenvector *> is nonzero only in elements ISUPPZ( 2*i-1 ) through *> ISUPPZ( 2*i ). This is an output of DSTEMR (tridiagonal -*> matrix). The support of the eigenvectors of A is typically +*> matrix). The support of the eigenvectors of A is typically *> 1:N because of the orthogonal transformations applied by DORMTR. *> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 *> \endverbatim @@ -277,12 +277,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 5*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + 5*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -300,13 +301,14 @@ *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) -*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK. +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. *> \endverbatim *> *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the IWORK array, @@ -330,7 +332,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevr_2stage * *> \par Contributors: * ================== @@ -358,7 +360,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -366,11 +368,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -444,8 +446,14 @@ IB = ILAENV2STAGE( 2, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -484,7 +492,7 @@ * NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) * NB = MAX( NB, ILAENV( 1, 'DORMTR', UPLO, N, -1, -1, -1 ) ) * LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN IWORK( 1 ) = LIWMIN END IF * @@ -504,7 +512,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 7 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = A( 1, 1 ) @@ -608,7 +616,7 @@ * Call DSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form. * * - CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), + CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), $ WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ), $ LHTRD, WORK( INDWK ), LLWORK, IINFO ) * @@ -727,7 +735,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/dsyevx.f b/lapack-netlib/SRC/dsyevx.f index 99719874b..fd6a78e32 100644 --- a/lapack-netlib/SRC/dsyevx.f +++ b/lapack-netlib/SRC/dsyevx.f @@ -244,7 +244,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevx * * ===================================================================== SUBROUTINE DSYEVX( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, IL, IU, @@ -338,14 +338,14 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWKMIN = 1 - WORK( 1 ) = LWKMIN + LWKOPT = 1 ELSE LWKMIN = 8*N NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'DORMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( LWKMIN, ( NB + 3 )*N ) - WORK( 1 ) = LWKOPT END IF + WORK( 1 ) = LWKOPT * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -17 diff --git a/lapack-netlib/SRC/dsygv.f b/lapack-netlib/SRC/dsygv.f index 5208dbb1f..02a4cc3ed 100644 --- a/lapack-netlib/SRC/dsygv.f +++ b/lapack-netlib/SRC/dsygv.f @@ -154,7 +154,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dsygv_2stage.f b/lapack-netlib/SRC/dsygv_2stage.f index 5c71ebf94..383304267 100644 --- a/lapack-netlib/SRC/dsygv_2stage.f +++ b/lapack-netlib/SRC/dsygv_2stage.f @@ -173,7 +173,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dsygvd.f b/lapack-netlib/SRC/dsygvd.f index 3b38665a7..41a384c80 100644 --- a/lapack-netlib/SRC/dsygvd.f +++ b/lapack-netlib/SRC/dsygvd.f @@ -42,12 +42,6 @@ *> B are assumed to be symmetric and B is also positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -190,7 +184,7 @@ *> the submatrix lying in rows and columns INFO/(N+1) *> through mod(INFO,N+1); *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dsygvx.f b/lapack-netlib/SRC/dsygvx.f index 3fa55b97c..2dc27e8a8 100644 --- a/lapack-netlib/SRC/dsygvx.f +++ b/lapack-netlib/SRC/dsygvx.f @@ -270,7 +270,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dsysv_aa.f b/lapack-netlib/SRC/dsysv_aa.f index 8dab5a384..0a96ecd7e 100644 --- a/lapack-netlib/SRC/dsysv_aa.f +++ b/lapack-netlib/SRC/dsysv_aa.f @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYsolve +*> \ingroup hesv_aa * * ===================================================================== SUBROUTINE DSYSV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -177,7 +177,7 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS + INTEGER LWKMIN, LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS * .. * .. External Functions .. LOGICAL LSAME @@ -196,6 +196,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -206,17 +207,17 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_SYTRF = INT( WORK(1) ) + LWKOPT_SYTRF = INT( WORK( 1 ) ) CALL DSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_SYTRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) + LWKOPT_SYTRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_SYTRF, LWKOPT_SYTRS ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/dsysv_aa_2stage.f b/lapack-netlib/SRC/dsysv_aa_2stage.f index 1100702ba..90dd0a38a 100644 --- a/lapack-netlib/SRC/dsysv_aa_2stage.f +++ b/lapack-netlib/SRC/dsysv_aa_2stage.f @@ -89,7 +89,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> @@ -101,14 +101,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is DOUBLE PRECISION array, dimension (LTB) +*> TB is DOUBLE PRECISION array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -148,14 +148,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION workspace of size LWORK +*> WORK is DOUBLE PRECISION workspace of size (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -179,7 +180,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYsolve +*> \ingroup hesv_aa_2stage * * ===================================================================== SUBROUTINE DSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, @@ -205,7 +206,7 @@ * * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKMIN, LWKOPT * .. * .. External Functions .. LOGICAL LSAME @@ -226,6 +227,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -234,18 +236,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN @@ -255,7 +258,6 @@ RETURN END IF * -* * Compute the factorization A = U**T*T*U or A = L*T*L**T. * CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, diff --git a/lapack-netlib/SRC/dsysvx.f b/lapack-netlib/SRC/dsysvx.f index a30831e72..b2b8210ca 100644 --- a/lapack-netlib/SRC/dsysvx.f +++ b/lapack-netlib/SRC/dsysvx.f @@ -275,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYsolve +*> \ingroup hesvx * * ===================================================================== SUBROUTINE DSYSVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B, @@ -305,7 +305,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKMIN, LWKOPT, NB DOUBLE PRECISION ANORM * .. * .. External Functions .. @@ -327,6 +327,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 3*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -344,12 +345,12 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 3*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 3*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'DSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) diff --git a/lapack-netlib/SRC/dsytrd.f b/lapack-netlib/SRC/dsytrd.f index 3dcfc3db2..58d4b633b 100644 --- a/lapack-netlib/SRC/dsytrd.f +++ b/lapack-netlib/SRC/dsytrd.f @@ -139,7 +139,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrd * *> \par Further Details: * ===================== @@ -247,7 +247,7 @@ * Determine the block size. * NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/dsytrd_2stage.f b/lapack-netlib/SRC/dsytrd_2stage.f index 8ae77d3e4..a88ac1c73 100644 --- a/lapack-netlib/SRC/dsytrd_2stage.f +++ b/lapack-netlib/SRC/dsytrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download DSYTRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download DSYTRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * DOUBLE PRECISION A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. -* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. *> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the orthogonal *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is DOUBLE PRECISION array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is DOUBLE PRECISION array, dimension (LHOUS2) +*> HOUS2 is DOUBLE PRECISION array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -143,23 +145,26 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +182,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +207,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +215,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -265,10 +270,13 @@ * KD = ILAENV2STAGE( 1, 'DSYTRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'DSYTRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'DSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -309,14 +317,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL DSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL DSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRD_SY2SB', -INFO ) RETURN END IF - CALL DSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL DSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRD_SB2ST', -INFO ) @@ -324,8 +332,7 @@ END IF * * - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN RETURN * * End of DSYTRD_2STAGE diff --git a/lapack-netlib/SRC/dsytrd_sb2st.F b/lapack-netlib/SRC/dsytrd_sb2st.F index fabc10756..04d03d587 100644 --- a/lapack-netlib/SRC/dsytrd_sb2st.F +++ b/lapack-netlib/SRC/dsytrd_sb2st.F @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, +* SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, * D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * #if defined(_OPENMP) @@ -53,12 +53,12 @@ *> \param[in] STAGE1 *> \verbatim *> STAGE1 is CHARACTER*1 -*> = 'N': "No": to mention that the stage 1 of the reduction +*> = 'N': "No": to mention that the stage 1 of the reduction *> from dense to band using the dsytrd_sy2sb routine -*> was not called before this routine to reproduce AB. -*> In other term this routine is called as standalone. -*> = 'Y': "Yes": to mention that the stage 1 of the -*> reduction from dense to band using the dsytrd_sy2sb +*> was not called before this routine to reproduce AB. +*> In other term this routine is called as standalone. +*> = 'Y': "Yes": to mention that the stage 1 of the +*> reduction from dense to band using the dsytrd_sy2sb *> routine has been called to produce AB (e.g., AB is *> the output of dsytrd_sy2sb. *> \endverbatim @@ -66,10 +66,10 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> and thus LHOUS is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate or to apply Q later on, +*> = 'V': the Householder representation is needed to +*> either generate or to apply Q later on, *> then LHOUS is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -132,34 +132,39 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is DOUBLE PRECISION array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is DOUBLE PRECISION array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. +*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error *> message related to LHOUS is issued by XERBLA. *> LHOUS = MAX(1, dimension) where *> dimension = 4*N if VECT='N' -*> not available now if VECT='H' +*> not available now if VECT='H' *> \endverbatim *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension LWORK. +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -188,7 +193,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup real16OTHERcomputational +*> \ingroup hetrd_hb2st * *> \par Further Details: * ===================== @@ -208,7 +213,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -216,16 +221,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, + SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, $ D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * #if defined(_OPENMP) @@ -258,11 +263,11 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, WANTQ, UPPER, AFTERS1 - INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, + INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, - $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, + $ NBTILES, TTYPE, TID, NTHREADS, + $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SIDEV, SIZETAU, LDV, LHMIN, LWMIN * .. @@ -274,7 +279,7 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV2STAGE + INTEGER ILAENV2STAGE EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. @@ -282,7 +287,6 @@ * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -291,9 +295,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -355,7 +364,7 @@ ABDPOS = KD + 1 ABOFDPOS = KD ELSE - APOS = INDA + APOS = INDA AWPOS = INDA + KD + 1 DPOS = APOS OFDPOS = DPOS + 1 @@ -363,11 +372,11 @@ ABOFDPOS = 2 ENDIF -* -* Case KD=0: -* The matrix is diagonal. We just copy it (convert to "real" for -* real because D is double and the imaginary part should be 0) -* and store it in D. A sequential code here is better or +* +* Case KD=0: +* The matrix is diagonal. We just copy it (convert to "real" for +* real because D is double and the imaginary part should be 0) +* and store it in D. A sequential code here is better or * in a parallel environment it might need two cores for D and E * IF( KD.EQ.0 ) THEN @@ -382,17 +391,17 @@ WORK( 1 ) = 1 RETURN END IF -* -* Case KD=1: -* The matrix is already Tridiagonal. We have to make diagonal +* +* Case KD=1: +* The matrix is already Tridiagonal. We have to make diagonal * and offdiagonal elements real, and store them in D and E. -* For that, for real precision just copy the diag and offdiag -* to D and E while for the COMPLEX case the bulge chasing is -* performed to convert the hermetian tridiagonal to symmetric -* tridiagonal. A simpler conversion formula might be used, but then +* For that, for real precision just copy the diag and offdiag +* to D and E while for the COMPLEX case the bulge chasing is +* performed to convert the hermetian tridiagonal to symmetric +* tridiagonal. A simpler conversion formula might be used, but then * updating the Q matrix will be required and based if Q is generated -* or not this might complicate the story. -* +* or not this might complicate the story. +* IF( KD.EQ.1 ) THEN DO 50 I = 1, N D( I ) = ( AB( ABDPOS, I ) ) @@ -413,7 +422,7 @@ RETURN END IF * -* Main code start here. +* Main code start here. * Reduce the symmetric band of A to a tridiagonal matrix. * THGRSIZ = N @@ -422,7 +431,7 @@ NBTILES = CEILING( REAL(N)/REAL(KD) ) STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) ) THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) ) -* +* CALL DLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA ) CALL DLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA ) * @@ -431,7 +440,7 @@ * #if defined(_OPENMP) !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) -!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) +!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) !$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) !$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) !$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) @@ -440,7 +449,7 @@ #endif * * main bulge chasing loop -* +* DO 100 THGRID = 1, THGRNB STT = (THGRID-1)*THGRSIZ+1 THED = MIN( (STT + THGRSIZ -1), (N-1)) @@ -451,7 +460,7 @@ ST = STT DO 130 SWEEPID = ST, ED DO 140 K = 1, GRSIZ - MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) + MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) $ + (M-1)*GRSIZ + K IF ( MYID.EQ.1 ) THEN TTYPE = 1 @@ -477,16 +486,16 @@ ENDIF * * Call the kernel -* +* #if defined(_OPENMP) && _OPENMP >= 201307 - IF( TTYPE.NE.1 ) THEN + IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK @@ -494,20 +503,20 @@ !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK ENDIF #else - CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, - $ WORK( INDW + TID*KD ) ) -#endif + $ WORK( INDW ) ) +#endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 EXIT @@ -522,14 +531,14 @@ !$OMP END MASTER !$OMP END PARALLEL #endif -* +* * Copy the diagonal from A to D. Note that D is REAL thus only * the Real part is needed, the imaginary part should be zero. * DO 150 I = 1, N D( I ) = ( WORK( DPOS+(I-1)*LDA ) ) 150 CONTINUE -* +* * Copy the off diagonal from A to E. Note that E is REAL thus only * the Real part is needed, the imaginary part should be zero. * @@ -543,11 +552,10 @@ 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN WORK( 1 ) = LWMIN RETURN * * End of DSYTRD_SB2ST * END - + diff --git a/lapack-netlib/SRC/dsytrd_sy2sb.f b/lapack-netlib/SRC/dsytrd_sy2sb.f index 1660b5c7e..38acc71f1 100644 --- a/lapack-netlib/SRC/dsytrd_sy2sb.f +++ b/lapack-netlib/SRC/dsytrd_sy2sb.f @@ -123,8 +123,8 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. *> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY) +*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -158,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrd_he2hb * *> \par Further Details: * ===================== @@ -293,8 +295,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', ' ', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/dsytrf.f b/lapack-netlib/SRC/dsytrf.f index a39b03283..2a1a2d4dc 100644 --- a/lapack-netlib/SRC/dsytrf.f +++ b/lapack-netlib/SRC/dsytrf.f @@ -107,7 +107,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -135,7 +135,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf * *> \par Further Details: * ===================== @@ -232,7 +232,7 @@ * Determine the block size * NB = ILAENV( 1, 'DSYTRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * @@ -352,6 +352,7 @@ END IF * 40 CONTINUE +* WORK( 1 ) = LWKOPT RETURN * diff --git a/lapack-netlib/SRC/dsytrf_aa.f b/lapack-netlib/SRC/dsytrf_aa.f index 346737953..924d4c165 100644 --- a/lapack-netlib/SRC/dsytrf_aa.f +++ b/lapack-netlib/SRC/dsytrf_aa.f @@ -74,7 +74,7 @@ *> *> On exit, the tridiagonal matrix is stored in the diagonals *> and the subdiagonals of A just below (or above) the diagonals, -*> and L is stored below (or above) the subdiaonals, when UPLO +*> and L is stored below (or above) the subdiagonals, when UPLO *> is 'L' (or 'U'). *> \endverbatim *> @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= MAX(1,2*N). For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -125,10 +127,10 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -152,7 +154,7 @@ * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB DOUBLE PRECISION ALPHA * .. @@ -179,18 +181,25 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N WORK( 1 ) = LWKOPT END IF * @@ -203,11 +212,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/dsytrf_aa_2stage.f b/lapack-netlib/SRC/dsytrf_aa_2stage.f index b7f5f07c2..fae95bab2 100644 --- a/lapack-netlib/SRC/dsytrf_aa_2stage.f +++ b/lapack-netlib/SRC/dsytrf_aa_2stage.f @@ -75,7 +75,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> @@ -87,14 +87,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is DOUBLE PRECISION array, dimension (LTB) +*> TB is DOUBLE PRECISION array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION workspace of size LWORK +*> WORK is DOUBLE PRECISION workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used +*> to select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -211,9 +211,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -227,10 +227,10 @@ NB = ILAENV( 1, 'DSYTRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = MAX( 1, (3*NB+1)*N ) END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = MAX( 1, N*NB ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -239,7 +239,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * diff --git a/lapack-netlib/SRC/dsytrf_rk.f b/lapack-netlib/SRC/dsytrf_rk.f index 7341b9263..0717eb076 100644 --- a/lapack-netlib/SRC/dsytrf_rk.f +++ b/lapack-netlib/SRC/dsytrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension ( MAX(1,LWORK) ). +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. *> @@ -229,7 +229,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_rk * *> \par Further Details: * ===================== @@ -310,7 +310,7 @@ * Determine the block size * NB = ILAENV( 1, 'DSYTRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/dsytrf_rook.f b/lapack-netlib/SRC/dsytrf_rook.f index 2f00d1802..316663485 100644 --- a/lapack-netlib/SRC/dsytrf_rook.f +++ b/lapack-netlib/SRC/dsytrf_rook.f @@ -118,7 +118,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -146,7 +146,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_rook * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/dsytri2.f b/lapack-netlib/SRC/dsytri2.f index dbcdcdb58..5960d3992 100644 --- a/lapack-netlib/SRC/dsytri2.f +++ b/lapack-netlib/SRC/dsytri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (N+NB+1)*(NB+3) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE DSYTRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -159,9 +159,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'DSYTRI2', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +177,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = MINSIZE RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - IF( NBMAX .GE. N ) THEN + IF( NBMAX.GE.N ) THEN CALL DSYTRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL DSYTRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of DSYTRI2 diff --git a/lapack-netlib/SRC/dsytri_3.f b/lapack-netlib/SRC/dsytri_3.f index 86d69cdfd..50834c605 100644 --- a/lapack-netlib/SRC/dsytri_3.f +++ b/lapack-netlib/SRC/dsytri_3.f @@ -119,16 +119,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (N+NB+1)*(NB+3). +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= (N+NB+1)*(NB+3). +*> The length of WORK. +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> -*> If LDWORK = -1, then a workspace query is assumed; +*> If LWORK = -1, then a workspace query is assumed; *> the routine only calculates the optimal size of the optimal *> size of the WORK array, returns this value as the first *> entry of the WORK array, and no error message related to @@ -152,7 +153,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetri_3 * *> \par Contributors: * ================== @@ -208,8 +209,13 @@ * * Determine the block size * - NB = MAX( 1, ILAENV( 1, 'DSYTRI_3', UPLO, N, -1, -1, -1 ) ) - LWKOPT = ( N+NB+1 ) * ( NB+3 ) + IF( N.EQ.0 ) THEN + LWKOPT = 1 + ELSE + NB = MAX( 1, ILAENV( 1, 'DSYTRI_3', UPLO, N, -1, -1, -1 ) ) + LWKOPT = ( N+NB+1 ) * ( NB+3 ) + END IF + WORK( 1 ) = LWKOPT * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 @@ -217,7 +223,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN INFO = -8 END IF * @@ -225,7 +231,6 @@ CALL XERBLA( 'DSYTRI_3', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = LWKOPT RETURN END IF * diff --git a/lapack-netlib/SRC/dsytrs_aa.f b/lapack-netlib/SRC/dsytrs_aa.f index 26b11a2a0..f0016cb7f 100644 --- a/lapack-netlib/SRC/dsytrs_aa.f +++ b/lapack-netlib/SRC/dsytrs_aa.f @@ -105,7 +105,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -123,7 +129,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrs_aa * * ===================================================================== SUBROUTINE DSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, @@ -151,7 +157,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME @@ -161,13 +167,19 @@ EXTERNAL DLACPY, DGTSV, DSWAP, DTRSM, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -178,21 +190,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKMIN RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN diff --git a/lapack-netlib/SRC/dtgevc.f b/lapack-netlib/SRC/dtgevc.f index e7084664c..be70b2083 100644 --- a/lapack-netlib/SRC/dtgevc.f +++ b/lapack-netlib/SRC/dtgevc.f @@ -52,7 +52,7 @@ *> *> S*x = w*P*x, (y**H)*S = w*(y**H)*P, *> -*> where y**H denotes the conjugate tranpose of y. +*> where y**H denotes the conjugate transpose of y. *> The eigenvalues are not input to this routine, but are computed *> directly from the diagonal blocks of S and P. *> @@ -337,7 +337,7 @@ EXTERNAL LSAME, DLAMCH * .. * .. External Subroutines .. - EXTERNAL DGEMV, DLABAD, DLACPY, DLAG2, DLALN2, XERBLA + EXTERNAL DGEMV, DLACPY, DLAG2, DLALN2, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN @@ -463,7 +463,6 @@ * SAFMIN = DLAMCH( 'Safe minimum' ) BIG = ONE / SAFMIN - CALL DLABAD( SAFMIN, BIG ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) SMALL = SAFMIN*N / ULP BIG = ONE / SMALL diff --git a/lapack-netlib/SRC/dtgex2.f b/lapack-netlib/SRC/dtgex2.f index 00c23a833..1ccd92c8f 100644 --- a/lapack-netlib/SRC/dtgex2.f +++ b/lapack-netlib/SRC/dtgex2.f @@ -254,7 +254,7 @@ $ THRESHA, THRESHB * .. * .. Local Arrays .. - INTEGER IWORK( LDST ) + INTEGER IWORK( LDST + 2 ) DOUBLE PRECISION AI( 2 ), AR( 2 ), BE( 2 ), IR( LDST, LDST ), $ IRCOP( LDST, LDST ), LI( LDST, LDST ), $ LICOP( LDST, LDST ), S( LDST, LDST ), diff --git a/lapack-netlib/SRC/dtgsna.f b/lapack-netlib/SRC/dtgsna.f index b9b3ad8af..013dc91bd 100644 --- a/lapack-netlib/SRC/dtgsna.f +++ b/lapack-netlib/SRC/dtgsna.f @@ -632,8 +632,8 @@ C1 = TWO*( ALPHAR*ALPHAR+ALPHAI*ALPHAI+BETA*BETA ) C2 = FOUR*BETA*BETA*ALPHAI*ALPHAI ROOT1 = C1 + SQRT( C1*C1-4.0D0*C2 ) - ROOT2 = C2 / ROOT1 ROOT1 = ROOT1 / TWO + ROOT2 = C2 / ROOT1 COND = MIN( SQRT( ROOT1 ), SQRT( ROOT2 ) ) END IF * diff --git a/lapack-netlib/SRC/dtrevc3.f b/lapack-netlib/SRC/dtrevc3.f index a4651e788..c8c04ad13 100644 --- a/lapack-netlib/SRC/dtrevc3.f +++ b/lapack-netlib/SRC/dtrevc3.f @@ -298,7 +298,7 @@ * INFO = 0 NB = ILAENV( 1, 'DTREVC', SIDE // HOWMNY, N, -1, -1, -1 ) - MAXWRK = N + 2*N*NB + MAXWRK = MAX( 1, N + 2*N*NB ) WORK(1) = MAXWRK LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN diff --git a/lapack-netlib/SRC/dtrsyl3.f b/lapack-netlib/SRC/dtrsyl3.f index c44ec3808..31a5230ba 100644 --- a/lapack-netlib/SRC/dtrsyl3.f +++ b/lapack-netlib/SRC/dtrsyl3.f @@ -1220,7 +1220,7 @@ * SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) BUF = BUF * SCALOC - CALL DLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + CALL DLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK(1) ) END IF * * Combine with buffer scaling factor. SCALE will be flushed if diff --git a/lapack-netlib/SRC/ilaenv.c b/lapack-netlib/SRC/ilaenv.c index c47224a0c..8f3b2db8e 100644 --- a/lapack-netlib/SRC/ilaenv.c +++ b/lapack-netlib/SRC/ilaenv.c @@ -191,7 +191,7 @@ typedef struct Namelist Namelist; #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER #define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -252,11 +252,11 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; -#define mycycle() continue; -#define myceiling(w) {ceil(w)} -#define myhuge(w) {HUGE_VAL} +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} -#define mymaxloc(w,s,e,n) {dmaxloc_(w,*(s),*(e),n)} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) /* procedure parameter types for -A and -C++ */ @@ -509,12 +509,18 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + /* Table of constant values */ static integer c__1 = 1; -static real c_b174 = 0.f; -static real c_b175 = 1.f; +static real c_b179 = 0.f; +static real c_b180 = 1.f; static integer c__0 = 0; /* > \brief \b ILAENV */ @@ -599,9 +605,9 @@ f"> */ /* > = 9: maximum size of the subproblems at the bottom of the */ /* > computation tree in the divide-and-conquer algorithm */ /* > (used by xGELSD and xGESDD) */ -/* > =10: ieee NaN arithmetic can be trusted not to trap */ +/* > =10: ieee infinity and NaN arithmetic can be trusted not to trap */ /* > =11: infinity arithmetic can be trusted not to trap */ -/* > 12 <= ISPEC <= 16: */ +/* > 12 <= ISPEC <= 17: */ /* > xHSEQR or related subroutines, */ /* > see IPARMQ for detailed explanation */ /* > \endverbatim */ @@ -652,9 +658,7 @@ f"> */ /* > \author Univ. of Colorado Denver */ /* > \author NAG Ltd. */ -/* > \date November 2019 */ - -/* > \ingroup OTHERauxiliary */ +/* > \ingroup ilaenv */ /* > \par Further Details: */ /* ===================== */ @@ -685,7 +689,7 @@ integer ilaenv_(integer *ispec, char *name__, char *opts, integer *n1, opts_len) { /* System generated locals */ - integer ret_val; + integer ret_val, i__1, i__2, i__3; /* Local variables */ logical twostage; @@ -702,10 +706,9 @@ integer ilaenv_(integer *ispec, char *name__, char *opts, integer *n1, integer *, integer *); -/* -- LAPACK auxiliary routine (version 3.9.0) -- */ +/* -- LAPACK auxiliary routine -- */ /* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ /* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ -/* November 2019 */ /* ===================================================================== */ @@ -728,6 +731,7 @@ integer ilaenv_(integer *ispec, char *name__, char *opts, integer *n1, case 14: goto L160; case 15: goto L160; case 16: goto L160; + case 17: goto L160; } /* Invalid value for ISPEC */ @@ -908,6 +912,12 @@ L50: } else { nb = 64; } + } else if (s_cmp(subnam + 3, "QP3RK", (ftnlen)4, (ftnlen)5) == 0) { + if (sname) { + nb = 32; + } else { + nb = 32; + } } } else if (s_cmp(c2, "PO", (ftnlen)2, (ftnlen)2) == 0) { if (s_cmp(c3, "TRF", (ftnlen)3, (ftnlen)3) == 0) { @@ -1034,6 +1044,21 @@ L50: } else { nb = 64; } + } else if (s_cmp(c3, "SYL", (ftnlen)3, (ftnlen)3) == 0) { +/* The upper bound is to prevent overly aggressive scaling. */ + if (sname) { +/* Computing MIN */ +/* Computing MAX */ + i__2 = 48, i__3 = (f2cmin(*n1,*n2) << 4) / 100; + i__1 = f2cmax(i__2,i__3); + nb = f2cmin(i__1,240); + } else { +/* Computing MIN */ +/* Computing MAX */ + i__2 = 24, i__3 = (f2cmin(*n1,*n2) << 3) / 100; + i__1 = f2cmax(i__2,i__3); + nb = f2cmin(i__1,80); + } } } else if (s_cmp(c2, "LA", (ftnlen)2, (ftnlen)2) == 0) { if (s_cmp(c3, "UUM", (ftnlen)3, (ftnlen)3) == 0) { @@ -1042,6 +1067,12 @@ L50: } else { nb = 64; } + } else if (s_cmp(c3, "TRS", (ftnlen)3, (ftnlen)3) == 0) { + if (sname) { + nb = 32; + } else { + nb = 32; + } } } else if (sname && s_cmp(c2, "ST", (ftnlen)2, (ftnlen)2) == 0) { if (s_cmp(c3, "EBZ", (ftnlen)3, (ftnlen)3) == 0) { @@ -1093,6 +1124,12 @@ L60: } else { nbmin = 2; } + } else if (s_cmp(subnam + 3, "QP3RK", (ftnlen)4, (ftnlen)5) == 0) { + if (sname) { + nbmin = 2; + } else { + nbmin = 2; + } } } else if (s_cmp(c2, "SY", (ftnlen)2, (ftnlen)2) == 0) { if (s_cmp(c3, "TRF", (ftnlen)3, (ftnlen)3) == 0) { @@ -1184,6 +1221,12 @@ L70: } else { nx = 128; } + } else if (s_cmp(subnam + 3, "QP3RK", (ftnlen)4, (ftnlen)5) == 0) { + if (sname) { + nx = 128; + } else { + nx = 128; + } } } else if (s_cmp(c2, "SY", (ftnlen)2, (ftnlen)2) == 0) { if (sname && s_cmp(c3, "TRD", (ftnlen)3, (ftnlen)3) == 0) { @@ -1270,29 +1313,29 @@ L130: L140: -/* ISPEC = 10: ieee NaN arithmetic can be trusted not to trap */ +/* ISPEC = 10: ieee and infinity NaN arithmetic can be trusted not to trap */ /* ILAENV = 0 */ ret_val = 1; if (ret_val == 1) { - ret_val = ieeeck_(&c__1, &c_b174, &c_b175); + ret_val = ieeeck_(&c__1, &c_b179, &c_b180); } return ret_val; L150: -/* ISPEC = 11: infinity arithmetic can be trusted not to trap */ +/* ISPEC = 11: ieee infinity arithmetic can be trusted not to trap */ /* ILAENV = 0 */ ret_val = 1; if (ret_val == 1) { - ret_val = ieeeck_(&c__0, &c_b174, &c_b175); + ret_val = ieeeck_(&c__0, &c_b179, &c_b180); } return ret_val; L160: -/* 12 <= ISPEC <= 16: xHSEQR or related subroutines. */ +/* 12 <= ISPEC <= 17: xHSEQR or related subroutines. */ ret_val = iparmq_(ispec, name__, opts, n1, n2, n3, n4) ; diff --git a/lapack-netlib/SRC/ilaenv.f b/lapack-netlib/SRC/ilaenv.f index a639e0375..e74a2b35e 100644 --- a/lapack-netlib/SRC/ilaenv.f +++ b/lapack-netlib/SRC/ilaenv.f @@ -132,7 +132,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup OTHERauxiliary +*> \ingroup ilaenv * *> \par Further Details: * ===================== @@ -355,6 +355,12 @@ ELSE NB = 64 END IF + ELSE IF( SUBNAM( 4: 7 ).EQ.'QP3RK' ) THEN + IF( SNAME ) THEN + NB = 32 + ELSE + NB = 32 + END IF END IF ELSE IF( C2.EQ.'PO' ) THEN IF( C3.EQ.'TRF' ) THEN @@ -541,7 +547,14 @@ ELSE NBMIN = 2 END IF + ELSE IF( SUBNAM( 4: 7 ).EQ.'QP3RK' ) THEN + IF( SNAME ) THEN + NBMIN = 2 + ELSE + NBMIN = 2 + END IF END IF + ELSE IF( C2.EQ.'SY' ) THEN IF( C3.EQ.'TRF' ) THEN IF( SNAME ) THEN @@ -618,6 +631,12 @@ ELSE NX = 128 END IF + ELSE IF( SUBNAM( 4: 7 ).EQ.'QP3RK' ) THEN + IF( SNAME ) THEN + NX = 128 + ELSE + NX = 128 + END IF END IF ELSE IF( C2.EQ.'SY' ) THEN IF( SNAME .AND. C3.EQ.'TRD' ) THEN diff --git a/lapack-netlib/SRC/iparam2stage.F b/lapack-netlib/SRC/iparam2stage.F index c701c2be0..52d507e9d 100644 --- a/lapack-netlib/SRC/iparam2stage.F +++ b/lapack-netlib/SRC/iparam2stage.F @@ -89,14 +89,14 @@ *> *> \param[in] NBI *> \verbatim -*> NBI is INTEGER which is the used in the reduciton, +*> NBI is INTEGER which is the used in the reduction, *> (e.g., the size of the band), needed to compute workspace *> and LHOUS2. *> \endverbatim *> *> \param[in] IBI *> \verbatim -*> IBI is INTEGER which represent the IB of the reduciton, +*> IBI is INTEGER which represent the IB of the reduction, *> needed to compute workspace and LHOUS2. *> \endverbatim *> diff --git a/lapack-netlib/SRC/sbbcsd.f b/lapack-netlib/SRC/sbbcsd.f index d5720cb33..2a619cb71 100644 --- a/lapack-netlib/SRC/sbbcsd.f +++ b/lapack-netlib/SRC/sbbcsd.f @@ -805,7 +805,7 @@ CALL SLARTGP( B22BULGE, B22E(I-1), WORK(IU2SN+I-1), $ WORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL SLARTGS( B21E(I), B21E(I+1), NU, WORK(IU2CS+I-1), + CALL SLARTGS( B21E(I), B21D(I+1), NU, WORK(IU2CS+I-1), $ WORK(IU2SN+I-1) ) ELSE CALL SLARTGS( B22D(I), B22E(I), MU, WORK(IU2CS+I-1), diff --git a/lapack-netlib/SRC/sbdsdc.f b/lapack-netlib/SRC/sbdsdc.f index 18a404497..2a6cc9970 100644 --- a/lapack-netlib/SRC/sbdsdc.f +++ b/lapack-netlib/SRC/sbdsdc.f @@ -45,13 +45,6 @@ *> respectively. SBDSDC can be used to compute all singular values, *> and optionally, singular vectors or singular vectors in compact form. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. See SLASD3 for details. -*> *> The code currently calls SLASDQ if singular values only are desired. *> However, it can be slightly modified to compute singular values *> using the divide and conquer method. diff --git a/lapack-netlib/SRC/sbdsqr.f b/lapack-netlib/SRC/sbdsqr.f index c798baaf5..880f0607b 100644 --- a/lapack-netlib/SRC/sbdsqr.f +++ b/lapack-netlib/SRC/sbdsqr.f @@ -277,7 +277,7 @@ $ MAXITDIVN, NM1, NM12, NM13, OLDLL, OLDM REAL ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU, $ OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL, - $ SINR, SLL, SMAX, SMIN, SMINL, SMINOA, + $ SINR, SLL, SMAX, SMIN, SMINOA, $ SN, THRESH, TOL, TOLMUL, UNFL * .. * .. External Functions .. @@ -390,7 +390,7 @@ DO 30 I = 1, N - 1 SMAX = MAX( SMAX, ABS( E( I ) ) ) 30 CONTINUE - SMINL = ZERO + SMIN = ZERO IF( TOL.GE.ZERO ) THEN * * Relative accuracy desired @@ -450,7 +450,6 @@ IF( TOL.LT.ZERO .AND. ABS( D( M ) ).LE.THRESH ) $ D( M ) = ZERO SMAX = ABS( D( M ) ) - SMIN = SMAX DO 70 LLL = 1, M - 1 LL = M - LLL ABSS = ABS( D( LL ) ) @@ -459,7 +458,6 @@ $ D( LL ) = ZERO IF( ABSE.LE.THRESH ) $ GO TO 80 - SMIN = MIN( SMIN, ABSS ) SMAX = MAX( SMAX, ABSS, ABSE ) 70 CONTINUE LL = 0 @@ -541,14 +539,14 @@ * apply convergence criterion forward * MU = ABS( D( LL ) ) - SMINL = MU + SMIN = MU DO 100 LLL = LL, M - 1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL+1 ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 100 CONTINUE END IF * @@ -569,14 +567,14 @@ * apply convergence criterion backward * MU = ABS( D( M ) ) - SMINL = MU + SMIN = MU DO 110 LLL = M - 1, LL, -1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 110 CONTINUE END IF END IF @@ -586,7 +584,7 @@ * Compute shift. First, test if shifting would ruin relative * accuracy, and if so set the shift to zero. * - IF( TOL.GE.ZERO .AND. N*TOL*( SMINL / SMAX ).LE. + IF( TOL.GE.ZERO .AND. N*TOL*( SMIN / SMAX ).LE. $ MAX( EPS, HNDRTH*TOL ) ) THEN * * Use a zero shift to avoid loss of relative accuracy diff --git a/lapack-netlib/SRC/sbdsvdx.f b/lapack-netlib/SRC/sbdsvdx.f index dcd554f1d..d7cb2dc83 100644 --- a/lapack-netlib/SRC/sbdsvdx.f +++ b/lapack-netlib/SRC/sbdsvdx.f @@ -45,7 +45,7 @@ *> *> Given an upper bidiagonal B with diagonal D = [ d_1 d_2 ... d_N ] *> and superdiagonal E = [ e_1 e_2 ... e_N-1 ], SBDSVDX computes the -*> singular value decompositon of B through the eigenvalues and +*> singular value decomposition of B through the eigenvalues and *> eigenvectors of the N*2-by-N*2 tridiagonal matrix *> *> | 0 d_1 | diff --git a/lapack-netlib/SRC/sgebal.f b/lapack-netlib/SRC/sgebal.f index f519c8c57..7c115fb6c 100644 --- a/lapack-netlib/SRC/sgebal.f +++ b/lapack-netlib/SRC/sgebal.f @@ -153,6 +153,9 @@ *> *> Modified by Tzu-Yi Chen, Computer Science Division, University of *> California at Berkeley, USA +*> +*> Refactored by Evert Provoost, Department of Computer Science, +*> KU Leuven, Belgium *> \endverbatim *> * ===================================================================== @@ -181,8 +184,8 @@ PARAMETER ( FACTOR = 0.95E+0 ) * .. * .. Local Scalars .. - LOGICAL NOCONV - INTEGER I, ICA, IEXC, IRA, J, K, L, M + LOGICAL NOCONV, CANSWAP + INTEGER I, ICA, IRA, J, K, L REAL C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1, $ SFMIN2 * .. @@ -197,7 +200,7 @@ * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN -* +* .. * Test the input parameters * INFO = 0 @@ -214,176 +217,192 @@ RETURN END IF * - K = 1 - L = N +* Quick returns. * - IF( N.EQ.0 ) - $ GO TO 210 + IF( N.EQ.0 ) THEN + ILO = 1 + IHI = 0 + RETURN + END IF * IF( LSAME( JOB, 'N' ) ) THEN - DO 10 I = 1, N + DO I = 1, N SCALE( I ) = ONE - 10 CONTINUE - GO TO 210 + END DO + ILO = 1 + IHI = N + RETURN END IF * - IF( LSAME( JOB, 'S' ) ) - $ GO TO 120 -* -* Permutation to isolate eigenvalues if possible -* - GO TO 50 -* -* Row and column exchange. -* - 20 CONTINUE - SCALE( M ) = J - IF( J.EQ.M ) - $ GO TO 30 -* - CALL SSWAP( L, A( 1, J ), 1, A( 1, M ), 1 ) - CALL SSWAP( N-K+1, A( J, K ), LDA, A( M, K ), LDA ) -* - 30 CONTINUE - GO TO ( 40, 80 )IEXC -* -* Search for rows isolating an eigenvalue and push them down. -* - 40 CONTINUE - IF( L.EQ.1 ) - $ GO TO 210 - L = L - 1 +* Permutation to isolate eigenvalues if possible. * - 50 CONTINUE - DO 70 J = L, 1, -1 -* - DO 60 I = 1, L - IF( I.EQ.J ) - $ GO TO 60 - IF( A( J, I ).NE.ZERO ) - $ GO TO 70 - 60 CONTINUE -* - M = L - IEXC = 1 - GO TO 20 - 70 CONTINUE -* - GO TO 90 + K = 1 + L = N * -* Search for columns isolating an eigenvalue and push them left. + IF( .NOT.LSAME( JOB, 'S' ) ) THEN * - 80 CONTINUE - K = K + 1 +* Row and column exchange. * - 90 CONTINUE - DO 110 J = K, L + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for rows isolating an eigenvalue and push them down. +* + NOCONV = .FALSE. + DO I = L, 1, -1 + CANSWAP = .TRUE. + DO J = 1, L + IF( I.NE.J .AND. A( I, J ).NE.ZERO ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( L ) = I + IF( I.NE.L ) THEN + CALL SSWAP( L, A( 1, I ), 1, A( 1, L ), 1 ) + CALL SSWAP( N-K+1, A( I, K ), LDA, A( L, K ), LDA ) + END IF + NOCONV = .TRUE. +* + IF( L.EQ.1 ) THEN + ILO = 1 + IHI = 1 + RETURN + END IF +* + L = L - 1 + END IF + END DO +* + END DO + + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for columns isolating an eigenvalue and push them left. +* + NOCONV = .FALSE. + DO J = K, L + CANSWAP = .TRUE. + DO I = K, L + IF( I.NE.J .AND. A( I, J ).NE.ZERO ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( K ) = J + IF( J.NE.K ) THEN + CALL SSWAP( L, A( 1, J ), 1, A( 1, K ), 1 ) + CALL SSWAP( N-K+1, A( J, K ), LDA, A( K, K ), LDA ) + END IF + NOCONV = .TRUE. +* + K = K + 1 + END IF + END DO +* + END DO * - DO 100 I = K, L - IF( I.EQ.J ) - $ GO TO 100 - IF( A( I, J ).NE.ZERO ) - $ GO TO 110 - 100 CONTINUE + END IF * - M = K - IEXC = 2 - GO TO 20 - 110 CONTINUE +* Initialize SCALE for non-permuted submatrix. * - 120 CONTINUE - DO 130 I = K, L + DO I = K, L SCALE( I ) = ONE - 130 CONTINUE + END DO * - IF( LSAME( JOB, 'P' ) ) - $ GO TO 210 +* If we only had to permute, we are done. +* + IF( LSAME( JOB, 'P' ) ) THEN + ILO = K + IHI = L + RETURN + END IF * * Balance the submatrix in rows K to L. * -* Iterative loop for norm reduction +* Iterative loop for norm reduction. * SFMIN1 = SLAMCH( 'S' ) / SLAMCH( 'P' ) SFMAX1 = ONE / SFMIN1 SFMIN2 = SFMIN1*SCLFAC SFMAX2 = ONE / SFMIN2 - 140 CONTINUE - NOCONV = .FALSE. -* - DO 200 I = K, L -* - C = SNRM2( L-K+1, A( K, I ), 1 ) - R = SNRM2( L-K+1, A( I, K ), LDA ) - ICA = ISAMAX( L, A( 1, I ), 1 ) - CA = ABS( A( ICA, I ) ) - IRA = ISAMAX( N-K+1, A( I, K ), LDA ) - RA = ABS( A( I, IRA+K-1 ) ) -* -* Guard against zero C or R due to underflow. -* - IF( C.EQ.ZERO .OR. R.EQ.ZERO ) - $ GO TO 200 - G = R / SCLFAC - F = ONE - S = C + R - 160 CONTINUE - IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR. - $ MIN( R, G, RA ).LE.SFMIN2 )GO TO 170 - F = F*SCLFAC - C = C*SCLFAC - CA = CA*SCLFAC - R = R / SCLFAC - G = G / SCLFAC - RA = RA / SCLFAC - GO TO 160 -* - 170 CONTINUE - G = C / SCLFAC - 180 CONTINUE - IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR. - $ MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190 - IF( SISNAN( C+F+CA+R+G+RA ) ) THEN * -* Exit if NaN to avoid infinite loop + NOCONV = .TRUE. + DO WHILE( NOCONV ) + NOCONV = .FALSE. * - INFO = -3 - CALL XERBLA( 'SGEBAL', -INFO ) - RETURN - END IF - F = F / SCLFAC - C = C / SCLFAC - G = G / SCLFAC - CA = CA / SCLFAC - R = R*SCLFAC - RA = RA*SCLFAC - GO TO 180 -* -* Now balance. -* - 190 CONTINUE - IF( ( C+R ).GE.FACTOR*S ) - $ GO TO 200 - IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN - IF( F*SCALE( I ).LE.SFMIN1 ) - $ GO TO 200 - END IF - IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN - IF( SCALE( I ).GE.SFMAX1 / F ) - $ GO TO 200 - END IF - G = ONE / F - SCALE( I ) = SCALE( I )*F - NOCONV = .TRUE. + DO I = K, L +* + C = SNRM2( L-K+1, A( K, I ), 1 ) + R = SNRM2( L-K+1, A( I, K ), LDA ) + ICA = ISAMAX( L, A( 1, I ), 1 ) + CA = ABS( A( ICA, I ) ) + IRA = ISAMAX( N-K+1, A( I, K ), LDA ) + RA = ABS( A( I, IRA+K-1 ) ) * - CALL SSCAL( N-K+1, G, A( I, K ), LDA ) - CALL SSCAL( L, F, A( 1, I ), 1 ) +* Guard against zero C or R due to underflow. * - 200 CONTINUE + IF( C.EQ.ZERO .OR. R.EQ.ZERO ) CYCLE +* +* Exit if NaN to avoid infinite loop * - IF( NOCONV ) - $ GO TO 140 + IF( SISNAN( C+CA+R+RA ) ) THEN + INFO = -3 + CALL XERBLA( 'SGEBAL', -INFO ) + RETURN + END IF +* + G = R / SCLFAC + F = ONE + S = C + R +* + DO WHILE( C.LT.G .AND. MAX( F, C, CA ).LT.SFMAX2 .AND. + $ MIN( R, G, RA ).GT.SFMIN2 ) + F = F*SCLFAC + C = C*SCLFAC + CA = CA*SCLFAC + R = R / SCLFAC + G = G / SCLFAC + RA = RA / SCLFAC + END DO +* + G = C / SCLFAC +* + DO WHILE( G.GE.R .AND. MAX( R, RA ).LT.SFMAX2 .AND. + $ MIN( F, C, G, CA ).GT.SFMIN2 ) + F = F / SCLFAC + C = C / SCLFAC + G = G / SCLFAC + CA = CA / SCLFAC + R = R*SCLFAC + RA = RA*SCLFAC + END DO +* +* Now balance. +* + IF( ( C+R ).GE.FACTOR*S ) CYCLE + IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN + IF( F*SCALE( I ).LE.SFMIN1 ) CYCLE + END IF + IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN + IF( SCALE( I ).GE.SFMAX1 / F ) CYCLE + END IF + G = ONE / F + SCALE( I ) = SCALE( I )*F + NOCONV = .TRUE. +* + CALL SSCAL( N-K+1, G, A( I, K ), LDA ) + CALL SSCAL( L, F, A( 1, I ), 1 ) +* + END DO +* + END DO * - 210 CONTINUE ILO = K IHI = L * diff --git a/lapack-netlib/SRC/sgebrd.f b/lapack-netlib/SRC/sgebrd.f index 08701164c..b33ad0b1f 100644 --- a/lapack-netlib/SRC/sgebrd.f +++ b/lapack-netlib/SRC/sgebrd.f @@ -122,7 +122,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -147,7 +148,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup gebrd * *> \par Further Details: * ===================== @@ -223,27 +224,35 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. EXTERNAL SGEBD2, SGEMM, SLABRD, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN, REAL + INTRINSIC MAX, MIN * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB - WORK( 1 ) = REAL( LWKOPT ) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + ENDIF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -251,7 +260,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -263,7 +272,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -282,7 +290,7 @@ * Determine when to switch from blocked to unblocked code. * IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using @@ -341,7 +349,8 @@ * CALL SGEBD2( M-I+1, N-I+1, A( I, I ), LDA, D( I ), E( I ), $ TAUQ( I ), TAUP( I ), WORK, IINFO ) - WORK( 1 ) = WS +* + WORK( 1 ) = SROUNDUP_LWORK( WS ) RETURN * * End of SGEBRD diff --git a/lapack-netlib/SRC/sgecon.f b/lapack-netlib/SRC/sgecon.f index 86aeea73b..82f463ebb 100644 --- a/lapack-netlib/SRC/sgecon.f +++ b/lapack-netlib/SRC/sgecon.f @@ -105,8 +105,15 @@ *> \verbatim *> INFO is INTEGER *> = 0: successful exit -*> < 0: if INFO = -i, the i-th argument had an illegal value -*> =-5: if ANORM is NAN or negative. +*> < 0: if INFO = -i, the i-th argument had an illegal value. +*> NaNs are illegal values for ANORM, and they propagate to +*> the output parameter RCOND. +*> Infinity is illegal for ANORM, and it propagates to the output +*> parameter RCOND as 0. +*> = 1: if RCOND = NaN, or +*> RCOND = Inf, or +*> the computed norm of the inverse of A is 0. +*> In the latter, RCOND = 0 is returned. *> \endverbatim * * Authors: @@ -117,7 +124,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup gecon * * ===================================================================== SUBROUTINE SGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, IWORK, @@ -147,7 +154,7 @@ LOGICAL ONENRM CHARACTER NORMIN INTEGER IX, KASE, KASE1 - REAL AINVNM, SCALE, SL, SMLNUM, SU + REAL AINVNM, SCALE, SL, SMLNUM, SU, HUGEVAL * .. * .. Local Arrays .. INTEGER ISAVE( 3 ) @@ -165,6 +172,8 @@ INTRINSIC ABS, MAX * .. * .. Executable Statements .. +* + HUGEVAL = SLAMCH( 'Overflow' ) * * Test the input parameters. * @@ -176,7 +185,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( ANORM.LT.ZERO .OR. SISNAN( ANORM ) ) THEN + ELSE IF( ANORM.LT.ZERO ) THEN INFO = -5 END IF IF( INFO.NE.0 ) THEN @@ -192,6 +201,13 @@ RETURN ELSE IF( ANORM.EQ.ZERO ) THEN RETURN + ELSE IF( SISNAN( ANORM ) ) THEN + RCOND = ANORM + INFO = -5 + RETURN + ELSE IF( ANORM.GT.HUGEVAL ) THEN + INFO = -5 + RETURN END IF * SMLNUM = SLAMCH( 'Safe minimum' ) @@ -248,8 +264,17 @@ * * Compute the estimate of the reciprocal condition number. * - IF( AINVNM.NE.ZERO ) - $ RCOND = ( ONE / AINVNM ) / ANORM + IF( AINVNM.NE.ZERO ) THEN + RCOND = ( ONE / AINVNM ) / ANORM + ELSE + INFO = 1 + RETURN + END IF +* +* Check for NaNs and Infs +* + IF( SISNAN( RCOND ) .OR. RCOND.GT.HUGEVAL ) + $ INFO = 1 * 20 CONTINUE RETURN diff --git a/lapack-netlib/SRC/sgedmd.c b/lapack-netlib/SRC/sgedmd.c new file mode 100644 index 000000000..c8f3a5964 --- /dev/null +++ b/lapack-netlib/SRC/sgedmd.c @@ -0,0 +1,1746 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i= 0 */ +/* The state space dimension (the row dimension of X, Y). */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshot pairs */ +/* (the number of columns of X and Y). */ +/* ..... */ +/* X (input/output) REAL(KIND=WP) M-by-N array */ +/* > On entry, X contains the data snapshot matrix X. It is */ +/* assumed that the column norms of X are in the range of */ +/* the normalized floating point numbers. */ +/* < On exit, the leading K columns of X contain a POD basis, */ +/* i.e. the leading K left singular vectors of the input */ +/* data matrix X, U(:,1:K). All N columns of X contain all */ +/* left singular vectors of the input matrix X. */ +/* See the descriptions of K, Z and W. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= M */ +/* The leading dimension of the array X. */ +/* ..... */ +/* Y (input/workspace/output) REAL(KIND=WP) M-by-N array */ +/* > On entry, Y contains the data snapshot matrix Y */ +/* < On exit, */ +/* If JOBR == 'R', the leading K columns of Y contain */ +/* the residual vectors for the computed Ritz pairs. */ +/* See the description of RES. */ +/* If JOBR == 'N', Y contains the original input data, */ +/* scaled according to the value of JOBS. */ +/* ..... */ +/* LDY (input) INTEGER , LDY >= M */ +/* The leading dimension of the array Y. */ +/* ..... */ +/* NRNK (input) INTEGER */ +/* Determines the mode how to compute the numerical rank, */ +/* i.e. how to truncate small singular values of the input */ +/* matrix X. On input, if */ +/* NRNK = -1 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(1) */ +/* This option is recommended. */ +/* NRNK = -2 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(i-1) */ +/* This option is included for R&D purposes. */ +/* It requires highly accurate SVD, which */ +/* may not be feasible. */ +/* The numerical rank can be enforced by using positive */ +/* value of NRNK as follows: */ +/* 0 < NRNK <= N :: at most NRNK largest singular values */ +/* will be used. If the number of the computed nonzero */ +/* singular values is less than NRNK, then only those */ +/* nonzero values will be used and the actually used */ +/* dimension is less than NRNK. The actual number of */ +/* the nonzero singular values is returned in the variable */ +/* K. See the descriptions of TOL and K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the POD basis for the data snapshot */ +/* matrix X and the number of the computed Ritz pairs. */ +/* The value of K is determined according to the rule set */ +/* by the parameters NRNK and TOL. */ +/* See the descriptions of NRNK and TOL. */ +/* ..... */ +/* REIG (output) REAL(KIND=WP) N-by-1 array */ +/* The leading K (K<=N) entries of REIG contain */ +/* the real parts of the computed eigenvalues */ +/* REIG(1:K) + sqrt(-1)*IMEIG(1:K). */ +/* See the descriptions of K, IMEIG, and Z. */ +/* ..... */ +/* IMEIG (output) REAL(KIND=WP) N-by-1 array */ +/* The leading K (K<=N) entries of IMEIG contain */ +/* the imaginary parts of the computed eigenvalues */ +/* REIG(1:K) + sqrt(-1)*IMEIG(1:K). */ +/* The eigenvalues are determined as follows: */ +/* If IMEIG(i) == 0, then the corresponding eigenvalue is */ +/* real, LAMBDA(i) = REIG(i). */ +/* If IMEIG(i)>0, then the corresponding complex */ +/* conjugate pair of eigenvalues reads */ +/* LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) */ +/* LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) */ +/* That is, complex conjugate pairs have consecutive */ +/* indices (i,i+1), with the positive imaginary part */ +/* listed first. */ +/* See the descriptions of K, REIG, and Z. */ +/* ..... */ +/* Z (workspace/output) REAL(KIND=WP) M-by-N array */ +/* If JOBZ =='V' then */ +/* Z contains real Ritz vectors as follows: */ +/* If IMEIG(i)=0, then Z(:,i) is an eigenvector of */ +/* the i-th Ritz value; ||Z(:,i)||_2=1. */ +/* If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then */ +/* [Z(:,i) Z(:,i+1)] span an invariant subspace and */ +/* the Ritz values extracted from this subspace are */ +/* REIG(i) + sqrt(-1)*IMEIG(i) and */ +/* REIG(i) - sqrt(-1)*IMEIG(i). */ +/* The corresponding eigenvectors are */ +/* Z(:,i) + sqrt(-1)*Z(:,i+1) and */ +/* Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. */ +/* || Z(:,i:i+1)||_F = 1. */ +/* If JOBZ == 'F', then the above descriptions hold for */ +/* the columns of X(:,1:K)*W(1:K,1:K), where the columns */ +/* of W(1:k,1:K) are the computed eigenvectors of the */ +/* K-by-K Rayleigh quotient. The columns of W(1:K,1:K) */ +/* are similarly structured: If IMEIG(i) == 0 then */ +/* X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 */ +/* then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and */ +/* X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) */ +/* are the eigenvectors of LAMBDA(i), LAMBDA(i+1). */ +/* See the descriptions of REIG, IMEIG, X and W. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... */ +/* RES (output) REAL(KIND=WP) N-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs. */ +/* If LAMBDA(i) is real, then */ +/* RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. */ +/* If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair */ +/* then */ +/* RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F */ +/* where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] */ +/* [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. */ +/* It holds that */ +/* RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 */ +/* RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 */ +/* where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) */ +/* ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) */ +/* See the description of REIG, IMEIG and Z. */ +/* ..... */ +/* B (output) REAL(KIND=WP) M-by-N array. */ +/* IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:M,1;K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* If JOBF =='N', then B is not referenced. */ +/* See the descriptions of X, W, K. */ +/* ..... */ +/* LDB (input) INTEGER, LDB >= M */ +/* The leading dimension of the array B. */ +/* ..... */ +/* W (workspace/output) REAL(KIND=WP) N-by-N array */ +/* On exit, W(1:K,1:K) contains the K computed */ +/* eigenvectors of the matrix Rayleigh quotient (real and */ +/* imaginary parts for each complex conjugate pair of the */ +/* eigenvalues). The Ritz vectors (returned in Z) are the */ +/* product of X (containing a POD basis for the input */ +/* matrix X) and W. See the descriptions of K, S, X and Z. */ +/* W is also used as a workspace to temporarily store the */ +/* left singular vectors of X. */ +/* ..... */ +/* LDW (input) INTEGER, LDW >= N */ +/* The leading dimension of the array W. */ +/* ..... */ +/* S (workspace/output) REAL(KIND=WP) N-by-N array */ +/* The array S(1:K,1:K) is used for the matrix Rayleigh */ +/* quotient. This content is overwritten during */ +/* the eigenvalue decomposition by SGEEV. */ +/* See the description of K. */ +/* ..... */ +/* LDS (input) INTEGER, LDS >= N */ +/* The leading dimension of the array S. */ +/* ..... */ +/* WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */ +/* On exit, WORK(1:N) contains the singular values of */ +/* X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). */ +/* If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain */ +/* scaling factor WORK(N+2)/WORK(N+1) used to scale X */ +/* and Y to avoid overflow in the SVD of X. */ +/* This may be of interest if the scaling option is off */ +/* and as many as possible smallest eigenvalues are */ +/* desired to the highest feasible accuracy. */ +/* If the call to SGEDMD is only workspace query, then */ +/* WORK(1) contains the minimal workspace length and */ +/* WORK(2) is the optimal workspace length. Hence, the */ +/* length of work is at least 2. */ +/* See the description of LWORK. */ +/* ..... */ +/* LWORK (input) INTEGER */ +/* The minimal length of the workspace vector WORK. */ +/* LWORK is calculated as follows: */ +/* If WHTSVD == 1 :: */ +/* If JOBZ == 'V', then */ +/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). */ +/* If JOBZ == 'N' then */ +/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). */ +/* Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal */ +/* workspace length of SGESVD. */ +/* If WHTSVD == 2 :: */ +/* If JOBZ == 'V', then */ +/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) */ +/* If JOBZ == 'N', then */ +/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) */ +/* Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the */ +/* minimal workspace length of SGESDD. */ +/* If WHTSVD == 3 :: */ +/* If JOBZ == 'V', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) */ +/* If JOBZ == 'N', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) */ +/* Here LWORK_SVD = N+M+MAX(3*N+1, */ +/* MAX(1,3*N+M,5*N),MAX(1,N)) */ +/* is the minimal workspace length of SGESVDQ. */ +/* If WHTSVD == 4 :: */ +/* If JOBZ == 'V', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) */ +/* If JOBZ == 'N', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) */ +/* Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the */ +/* minimal workspace length of SGEJSV. */ +/* The above expressions are not simplified in order to */ +/* make the usage of WORK more transparent, and for */ +/* easier checking. In any case, LWORK >= 2. */ +/* If on entry LWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */ +/* Workspace that is required only if WHTSVD equals */ +/* 2 , 3 or 4. (See the description of WHTSVD). */ +/* If on entry LWORK =-1 or LIWORK=-1, then the */ +/* minimal length of IWORK is computed and returned in */ +/* IWORK(1). See the description of LIWORK. */ +/* ..... */ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */ +/* If on entry LIWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). */ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ +/* Local arrays */ +/* ~~~~~~~~~~~~ */ +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ +/* ............................................................ */ + /* Parameter adjustments */ + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --reig; + --imeig; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + w_dim1 = *ldw; + w_offset = 1 + w_dim1 * 1; + w -= w_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --work; + --iwork; + + /* Function Body */ + one = 1.f; + zero = 0.f; + +/* Test the input arguments */ + + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + *info = 0; + lquery = *lwork == -1 || *liwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || lsame_(jobz, "N") || lsame_( + jobz, "F"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && ! wntvec) { + *info = -3; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -4; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -5; + } else if (*m < 0) { + *info = -6; + } else if (*n < 0 || *n > *m) { + *info = -7; + } else if (*ldx < *m) { + *info = -9; + } else if (*ldy < *m) { + *info = -11; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -12; + } else if (*tol < zero || *tol >= one) { + *info = -13; + } else if (*ldz < *m) { + *info = -18; + } else if ((wntref || wntex) && *ldb < *m) { + *info = -21; + } else if (*ldw < *n) { + *info = -23; + } else if (*lds < *n) { + *info = -25; + } + + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0) { +/* Quick return. All output except K is void. */ +/* INFO=1 signals the void input. */ +/* In case of a workspace query, the default */ +/* minimal workspace lengths are returned. */ + if (lquery) { + iwork[1] = 1; + work[1] = 2.f; + work[2] = 2.f; + } else { + *k = 0; + } + *info = 1; + return 0; + } + mlwork = f2cmax(2,*n); + olwork = f2cmax(2,*n); + iminwr = 1; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of SGESVD: */ +/* MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) */ +/* Computing MAX */ + i__1 = 1, i__2 = f2cmin(*m,*n) * 3 + f2cmax(*m,*n), i__1 = f2cmax(i__1, + i__2), i__2 = f2cmin(*m,*n) * 5; + mwrsvd = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsvd; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + sgesvd_("O", "S", m, n, &x[x_offset], ldx, &work[1], &b[ + b_offset], ldb, &w[w_offset], ldw, rdummy, &c_n1, & + info1); +/* Computing MAX */ + i__1 = mwrsvd, i__2 = (integer) rdummy[0]; + lwrsvd = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrsvd; + olwork = f2cmax(i__1,i__2); + } + } else if (*whtsvd == 2) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of SGESDD: */ +/* MWRSDD = 3*MIN(M,N)*MIN(M,N) + */ +/* MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) */ +/* IMINWR = 8*MIN(M,N) */ +/* Computing MAX */ + i__1 = f2cmax(*m,*n), i__2 = f2cmin(*m,*n) * 5 * f2cmin(*m,*n) + (f2cmin(*m,* + n) << 2); + mwrsdd = f2cmin(*m,*n) * 3 * f2cmin(*m,*n) + f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsdd; + mlwork = f2cmax(i__1,i__2); + iminwr = f2cmin(*m,*n) << 3; + if (lquery) { + sgesdd_("O", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], + ldb, &w[w_offset], ldw, rdummy, &c_n1, &iwork[1], & + info1); +/* Computing MAX */ + i__1 = mwrsdd, i__2 = (integer) rdummy[0]; + lwrsdd = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrsdd; + olwork = f2cmax(i__1,i__2); + } + } else if (*whtsvd == 3) { +/* LWQP3 = 3*N+1 */ +/* LWORQ = MAX(N, 1) */ +/* MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) */ +/* MWRSVQ = N + MAX( LWQP3, MWRSVD, LWORQ )+ MAX(M,2) */ +/* MLWORK = N + MWRSVQ */ +/* IMINWR = M+N-1 */ + sgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &work[ + 1], &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, & + iwork[1], &c_n1, rdummy, &c_n1, rdummy2, &c_n1, &info1); + iminwr = iwork[1]; + mwrsvq = (integer) rdummy[1]; +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsvq + (integer) rdummy2[0]; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + lwrsvq = (integer) rdummy[0]; +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrsvq + (integer) rdummy2[0]; + olwork = f2cmax(i__1,i__2); + } + } else if (*whtsvd == 4) { + *(unsigned char *)jsvopt = 'J'; +/* MWRSVJ = MAX( 7, 2*M+N, 6*N+2*N*N )! for JSVOPT='V' */ +/* Computing MAX */ + i__1 = 7, i__2 = (*m << 1) + *n, i__1 = f2cmax(i__1,i__2), i__2 = (* + n << 2) + *n * *n, i__1 = f2cmax(i__1,i__2), i__2 = (*n << 1) + + *n * *n + 6; + mwrsvj = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsvj; + mlwork = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 3, i__2 = *m + *n * 3; + iminwr = f2cmax(i__1,i__2); + if (lquery) { +/* Computing MAX */ + i__1 = olwork, i__2 = *n + mwrsvj; + olwork = f2cmax(i__1,i__2); + } + } +/* END SELECT */ + if (wntvec || wntex || lsame_(jobz, "F")) { + *(unsigned char *)jobzl = 'V'; + } else { + *(unsigned char *)jobzl = 'N'; + } +/* Workspace calculation to the SGEEV call */ + if (lsame_(jobzl, "V")) { +/* Computing MAX */ + i__1 = 1, i__2 = *n << 2; + mwrkev = f2cmax(i__1,i__2); + } else { +/* Computing MAX */ + i__1 = 1, i__2 = *n * 3; + mwrkev = f2cmax(i__1,i__2); + } +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrkev; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + sgeev_("N", jobzl, n, &s[s_offset], lds, &reig[1], &imeig[1], &w[ + w_offset], ldw, &w[w_offset], ldw, rdummy, &c_n1, &info1); +/* Computing MAX */ + i__1 = mwrkev, i__2 = (integer) rdummy[0]; + lwrkev = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrkev; + olwork = f2cmax(i__1,i__2); + } + + if (*liwork < iminwr && ! lquery) { + *info = -29; + } + if (*lwork < mlwork && ! lquery) { + *info = -27; + } + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("SGEDMD", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + work[1] = (real) mlwork; + work[2] = (real) olwork; + return 0; + } +/* ............................................................ */ + + ofl = slamch_("O"); + small = slamch_("S"); + badxy = FALSE_; + +/* <1> Optional scaling of the snapshots (columns of X, Y) */ +/* ========================================================== */ + if (sccolx) { +/* The columns of X will be normalized. */ +/* To prevent overflows, the column norms of X are */ +/* carefully computed using SLASSQ. */ + *k = 0; + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* WORK(i) = DNRM2( M, X(1,i), 1 ) */ + scale = zero; + slassq_(m, &x[i__ * x_dim1 + 1], &c__1, &scale, &ssum); + if (sisnan_(&scale) || sisnan_(&ssum)) { + *k = 0; + *info = -8; + i__2 = -(*info); + xerbla_("SGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of X(:,i) overflows. First, X(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. */ +/* Next, the norm of X(:,i) is stored without */ +/* overflow as WORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of X(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + r__1 = one / rootsc; + slascl_("G", &c__0, &c__0, &scale, &r__1, m, &c__1, &x[ + i__ * x_dim1 + 1], m, &info2); + work[i__] = -scale * (rootsc / (real) (*m)); + } else { +/* X(:,i) will be scaled to unit 2-norm */ + work[i__] = scale * rootsc; + slascl_("G", &c__0, &c__0, &work[i__], &one, m, &c__1, &x[ + i__ * x_dim1 + 1], m, &info2); +/* X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC */ +/* LAPACK */ + } + } else { + work[i__] = zero; + ++(*k); + } + } + if (*k == *n) { +/* All columns of X are zero. Return error code -8. */ +/* (the 8th input variable had an illegal value) */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("SGEDMD", &i__1); + return 0; + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of Y. */ + if (work[i__] > zero) { + r__1 = one / work[i__]; + sscal_(m, &r__1, &y[i__ * y_dim1 + 1], &c__1); +/* Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (work[i__] < zero) { + r__1 = -work[i__]; + r__2 = one / (real) (*m); + slascl_("G", &c__0, &c__0, &r__1, &r__2, m, &c__1, &y[i__ * + y_dim1 + 1], m, &info2); +/* LAPACK CA */ + } else if (y[isamax_(m, &y[i__ * y_dim1 + 1], &c__1) + i__ * + y_dim1] != zero) { +/* X(:,i) is zero vector. For consistency, */ +/* Y(:,i) should also be zero. If Y(:,i) is not */ +/* zero, then the data might be inconsistent or */ +/* corrupted. If JOBS == 'C', Y(:,i) is set to */ +/* zero and a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + if (lsame_(jobs, "C")) { + sscal_(m, &zero, &y[i__ * y_dim1 + 1], &c__1); + } +/* BLAS CALL */ + } + } + } + + if (sccoly) { +/* The columns of Y will be normalized. */ +/* To prevent overflows, the column norms of Y are */ +/* carefully computed using SLASSQ. */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* WORK(i) = DNRM2( M, Y(1,i), 1 ) */ + scale = zero; + slassq_(m, &y[i__ * y_dim1 + 1], &c__1, &scale, &ssum); + if (sisnan_(&scale) || sisnan_(&ssum)) { + *k = 0; + *info = -10; + i__2 = -(*info); + xerbla_("SGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of Y(:,i) overflows. First, Y(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. */ +/* Next, the norm of Y(:,i) is stored without */ +/* overflow as WORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of Y(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + r__1 = one / rootsc; + slascl_("G", &c__0, &c__0, &scale, &r__1, m, &c__1, &y[ + i__ * y_dim1 + 1], m, &info2); + work[i__] = -scale * (rootsc / (real) (*m)); + } else { +/* X(:,i) will be scaled to unit 2-norm */ + work[i__] = scale * rootsc; + slascl_("G", &c__0, &c__0, &work[i__], &one, m, &c__1, &y[ + i__ * y_dim1 + 1], m, &info2); +/* Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* LAPACK */ + } + } else { + work[i__] = zero; + } + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of X. */ + if (work[i__] > zero) { + r__1 = one / work[i__]; + sscal_(m, &r__1, &x[i__ * x_dim1 + 1], &c__1); +/* X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (work[i__] < zero) { + r__1 = -work[i__]; + r__2 = one / (real) (*m); + slascl_("G", &c__0, &c__0, &r__1, &r__2, m, &c__1, &x[i__ * + x_dim1 + 1], m, &info2); +/* LAPACK CA */ + } else if (x[isamax_(m, &x[i__ * x_dim1 + 1], &c__1) + i__ * + x_dim1] != zero) { +/* Y(:,i) is zero vector. If X(:,i) is not */ +/* zero, then a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + } + } + } + +/* <2> SVD of the data snapshot matrix X. */ +/* ===================================== */ +/* The left singular vectors are stored in the array X. */ +/* The right singular vectors are in the array W. */ +/* The array W will later on contain the eigenvectors */ +/* of a Rayleigh quotient. */ + numrnk = *n; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { + i__1 = *lwork - *n; + sgesvd_("O", "S", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], + ldb, &w[w_offset], ldw, &work[*n + 1], &i__1, &info1); +/* LAPACK CAL */ + *(unsigned char *)t_or_n__ = 'T'; + } else if (*whtsvd == 2) { + i__1 = *lwork - *n; + sgesdd_("O", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], ldb, &w[ + w_offset], ldw, &work[*n + 1], &i__1, &iwork[1], &info1); +/* LAPACK CAL */ + *(unsigned char *)t_or_n__ = 'T'; + } else if (*whtsvd == 3) { + i__1 = *lwork - *n - f2cmax(2,*m); + i__2 = f2cmax(2,*m); + sgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &work[1], & + z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &iwork[1], + liwork, &work[*n + f2cmax(2,*m) + 1], &i__1, &work[*n + 1], & + i__2, &info1); + + slacpy_("A", m, &numrnk, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK C */ + *(unsigned char *)t_or_n__ = 'T'; + } else if (*whtsvd == 4) { + i__1 = *lwork - *n; + sgejsv_("F", "U", jsvopt, "N", "N", "P", m, n, &x[x_offset], ldx, & + work[1], &z__[z_offset], ldz, &w[w_offset], ldw, &work[*n + 1] + , &i__1, &iwork[1], &info1); +/* LAPACK CALL */ + slacpy_("A", m, n, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK CALL */ + *(unsigned char *)t_or_n__ = 'N'; + xscl1 = work[*n + 1]; + xscl2 = work[*n + 2]; + if (xscl1 != xscl2) { +/* This is an exceptional situation. If the */ +/* data matrices are not scaled and the */ +/* largest singular value of X overflows. */ +/* In that case SGEJSV can return the SVD */ +/* in scaled form. The scaling factor can be used */ +/* to rescale the data (X and Y). */ + slascl_("G", &c__0, &c__0, &xscl1, &xscl2, m, n, &y[y_offset], + ldy, &info2); + } +/* END SELECT */ + } + + if (info1 > 0) { +/* The SVD selected subroutine did not converge. */ +/* Return with an error code. */ + *info = 2; + return 0; + } + + if (work[1] == zero) { +/* The largest computed singular value of (scaled) */ +/* X is zero. Return error code -8 */ +/* (the 8th input variable had an illegal value). */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("SGEDMD", &i__1); + return 0; + } + +/* <3> Determine the numerical rank of the data */ +/* snapshots matrix X. This depends on the */ +/* parameters NRNK and TOL. */ +/* SELECT CASE ( NRNK ) */ + if (*nrnk == -1) { + *k = 1; + i__1 = numrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (work[i__] <= work[1] * *tol || work[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else if (*nrnk == -2) { + *k = 1; + i__1 = numrnk - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + if (work[i__ + 1] <= work[i__] * *tol || work[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else { + *k = 1; + i__1 = *nrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (work[i__] <= small) { + myexit_(); + } + ++(*k); + } +/* END SELECT */ + } +/* Now, U = X(1:M,1:K) is the SVD/POD basis for the */ +/* snapshot data in the input matrix X. */ +/* <4> Compute the Rayleigh quotient S = U^T * A * U. */ +/* Depending on the requested outputs, the computation */ +/* is organized to compute additional auxiliary */ +/* matrices (for the residuals and refinements). */ + +/* In all formulas below, we need V_k*Sigma_k^(-1) */ +/* where either V_k is in W(1:N,1:K), or V_k^T is in */ +/* W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). */ + if (lsame_(t_or_n__, "N")) { + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + r__1 = one / work[i__]; + sscal_(n, &r__1, &w[i__ * w_dim1 + 1], &c__1); +/* W(1:N,i) = (ONE/WORK(i)) * W(1:N,i) ! INTRINSIC */ +/* BLAS CALL */ + } + } else { +/* This non-unit stride access is due to the fact */ +/* that SGESVD, SGESVDQ and SGESDD return the */ +/* transposed matrix of the right singular vectors. */ +/* DO i = 1, K */ +/* CALL SSCAL( N, ONE/WORK(i), W(i,1), LDW ) ! BLAS CALL */ +/* ! W(i,1:N) = (ONE/WORK(i)) * W(i,1:N) ! INTRINSIC */ +/* END DO */ + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + work[*n + i__] = one / work[i__]; + } + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *k; + for (i__ = 1; i__ <= i__2; ++i__) { + w[i__ + j * w_dim1] = work[*n + i__] * w[i__ + j * w_dim1]; + } + } + } + + if (wntref) { + +/* Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) */ +/* for computing the refined Ritz vectors */ +/* (optionally, outside SGEDMD). */ + sgemm_("N", t_or_n__, m, k, n, &one, &y[y_offset], ldy, &w[w_offset], + ldw, &zero, &z__[z_offset], ldz); +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRI */ +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K)) ! INTRI */ + +/* At this point Z contains */ +/* A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and */ +/* this is needed for computing the residuals. */ +/* This matrix is returned in the array B and */ +/* it can be used to compute refined Ritz vectors. */ +/* BLAS */ + slacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); +/* B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC */ +/* BLAS CALL */ + sgemm_("T", "N", k, k, m, &one, &x[x_offset], ldx, &z__[z_offset], + ldz, &zero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRI */ +/* At this point S = U^T * A * U is the Rayleigh quotient. */ +/* BLAS */ + } else { +/* A * U(:,1:K) is not explicitly needed and the */ +/* computation is organized differently. The Rayleigh */ +/* quotient is computed more efficiently. */ + sgemm_("T", "N", k, n, m, &one, &x[x_offset], ldx, &y[y_offset], ldy, + &zero, &z__[z_offset], ldz); +/* Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) ) ! IN */ +/* In the two SGEMM calls here, can use K for LDZ */ +/* B */ + sgemm_("N", t_or_n__, k, k, n, &one, &z__[z_offset], ldz, &w[w_offset] + , ldw, &zero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRIN */ +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K))) ! INTRIN */ +/* At this point S = U^T * A * U is the Rayleigh quotient. */ +/* If the residuals are requested, save scaled V_k into Z. */ +/* Recall that V_k or V_k^T is stored in W. */ +/* BLAS */ + if (wntres || wntex) { + if (lsame_(t_or_n__, "N")) { + slacpy_("A", n, k, &w[w_offset], ldw, &z__[z_offset], ldz); + } else { + slacpy_("A", k, n, &w[w_offset], ldw, &z__[z_offset], ldz); + } + } + } + +/* <5> Compute the Ritz values and (if requested) the */ +/* right eigenvectors of the Rayleigh quotient. */ + + i__1 = *lwork - *n; + sgeev_("N", jobzl, k, &s[s_offset], lds, &reig[1], &imeig[1], &w[w_offset] + , ldw, &w[w_offset], ldw, &work[*n + 1], &i__1, &info1); + +/* W(1:K,1:K) contains the eigenvectors of the Rayleigh */ +/* quotient. Even in the case of complex spectrum, all */ +/* computation is done in real arithmetic. REIG and */ +/* IMEIG are the real and the imaginary parts of the */ +/* eigenvalues, so that the spectrum is given as */ +/* REIG(:) + sqrt(-1)*IMEIG(:). Complex conjugate pairs */ +/* are listed at consecutive positions. For such a */ +/* complex conjugate pair of the eigenvalues, the */ +/* corresponding eigenvectors are also a complex */ +/* conjugate pair with the real and imaginary parts */ +/* stored column-wise in W at the corresponding */ +/* consecutive column indices. See the description of Z. */ +/* Also, see the description of SGEEV. */ +/* LAPACK C */ + if (info1 > 0) { +/* SGEEV failed to compute the eigenvalues and */ +/* eigenvectors of the Rayleigh quotient. */ + *info = 3; + return 0; + } + +/* <6> Compute the eigenvectors (if requested) and, */ +/* the residuals (if requested). */ + + if (wntvec || wntex) { + if (wntres) { + if (wntref) { +/* Here, if the refinement is requested, we have */ +/* A*U(:,1:K) already computed and stored in Z. */ +/* For the residuals, need Y = A * U(:,1;K) * W. */ + sgemm_("N", "N", m, k, k, &one, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zero, &y[y_offset], ldy); +/* Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC */ +/* This frees Z; Y contains A * U(:,1:K) * W. */ +/* BLAS CALL */ + } else { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) is stored in Z */ + sgemm_(t_or_n__, "N", n, k, k, &one, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + sgemm_("N", "N", m, k, n, &one, &y[y_offset], ldy, &s[ + s_offset], lds, &zero, &z__[z_offset], ldz); +/* Save a copy of Z into Y and free Z for holding */ +/* the Ritz vectors. */ + slacpy_("A", m, k, &z__[z_offset], ldz, &y[y_offset], ldy); + if (wntex) { + slacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); + } + } + } else if (wntex) { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) is stored in Z */ + sgemm_(t_or_n__, "N", n, k, k, &one, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + sgemm_("N", "N", m, k, n, &one, &y[y_offset], ldy, &s[s_offset], + lds, &zero, &b[b_offset], ldb); +/* The above call replaces the following two calls */ +/* that were used in the developing-testing phase. */ +/* CALL SGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & */ +/* LDS, ZERO, Z, LDZ) */ +/* Save a copy of Z into B and free Z for holding */ +/* the Ritz vectors. */ +/* CALL SLACPY( 'A', M, K, Z, LDZ, B, LDB ) */ + } + +/* Compute the real form of the Ritz vectors */ + if (wntvec) { + sgemm_("N", "N", m, k, k, &one, &x[x_offset], ldx, &w[w_offset], + ldw, &zero, &z__[z_offset], ldz); + } +/* Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC */ + +/* BLAS CALL */ + if (wntres) { + i__ = 1; + while(i__ <= *k) { + if (imeig[i__] == zero) { +/* have a real eigenvalue with real eigenvector */ + r__1 = -reig[i__]; + saxpy_(m, &r__1, &z__[i__ * z_dim1 + 1], &c__1, &y[i__ * + y_dim1 + 1], &c__1); +/* Y(1:M,i) = Y(1:M,i) - REIG(i) * Z(1:M,i) ! */ + + res[i__] = snrm2_(m, &y[i__ * y_dim1 + 1], &c__1); + ++i__; + } else { +/* Have a complex conjugate pair */ +/* REIG(i) +- sqrt(-1)*IMEIG(i). */ +/* Since all computation is done in real */ +/* arithmetic, the formula for the residual */ +/* is recast for real representation of the */ +/* complex conjugate eigenpair. See the */ +/* description of RES. */ + ab[0] = reig[i__]; + ab[1] = -imeig[i__]; + ab[2] = imeig[i__]; + ab[3] = reig[i__]; + r__1 = -one; + sgemm_("N", "N", m, &c__2, &c__2, &r__1, &z__[i__ * + z_dim1 + 1], ldz, ab, &c__2, &one, &y[i__ * + y_dim1 + 1], ldy); +/* Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB ! INT */ +/* BL */ + res[i__] = slange_("F", m, &c__2, &y[i__ * y_dim1 + 1], + ldy, &work[*n + 1]); +/* LA */ + res[i__ + 1] = res[i__]; + i__ += 2; + } + } + } + } + + if (*whtsvd == 4) { + work[*n + 1] = xscl1; + work[*n + 2] = xscl2; + } + +/* Successful exit. */ + if (! badxy) { + *info = 0; + } else { +/* A warning on possible data inconsistency. */ +/* This should be a rare event. */ + *info = 4; + } +/* ............................................................ */ + return 0; +/* ...... */ +} /* sgedmd_ */ + diff --git a/lapack-netlib/SRC/sgedmd.f90 b/lapack-netlib/SRC/sgedmd.f90 new file mode 100644 index 000000000..4860e8898 --- /dev/null +++ b/lapack-netlib/SRC/sgedmd.f90 @@ -0,0 +1,1206 @@ +!> \brief \b SGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== +! +! SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, REIG, IMEIG, Z, LDZ, RES, & +! B, LDB, W, LDW, S, LDS, & +! WORK, LWORK, IWORK, LIWORK, INFO ) +!..... +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real32 +!..... +! Scalar arguments +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LWORK, LIWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments +! REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & +! RES(*) +! REAL(KIND=WP), INTENT(OUT) :: WORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) +! +!............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> SGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, SGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, SGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim +!...................................................................... +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. +!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim +!...................................................................... +!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Distribution Statement A: +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim +!============================================================ +! Arguments +! ========= +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim +!..... +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. +!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim +!..... +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. +!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim +!..... +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim +!..... +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library. +!> 1 :: SGESVD (the QR SVD algorithm) +!> 2 :: SGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: SGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim +!..... +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim +!..... +!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim +!..... +!> \param[in,out] X +!> \verbatim +!> X (input/output) REAL(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. +!> \endverbatim +!..... +!> \param[in] LDX +!> \verbatim +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim +!..... +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) REAL(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim +!..... +!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim +!..... +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1) +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim +!..... +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim +!..... +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim +!..... +!> \param[out] REIG +!> \verbatim +!> REIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of REIG contain +!> the real parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> See the descriptions of K, IMEIG, and Z. +!> \endverbatim +!..... +!> \param[out] IMEIG +!> \verbatim +!> IMEIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of IMEIG contain +!> the imaginary parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> The eigenvalues are determined as follows: +!> If IMEIG(i) == 0, then the corresponding eigenvalue is +!> real, LAMBDA(i) = REIG(i). +!> If IMEIG(i)>0, then the corresponding complex +!> conjugate pair of eigenvalues reads +!> LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) +!> LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) +!> That is, complex conjugate pairs have consecutive +!> indices (i,i+1), with the positive imaginary part +!> listed first. +!> See the descriptions of K, REIG, and Z. +!> \endverbatim +!..... +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) REAL(KIND=WP) M-by-N array +!> If JOBZ =='V' then +!> Z contains real Ritz vectors as follows: +!> If IMEIG(i)=0, then Z(:,i) is an eigenvector of +!> the i-th Ritz value; ||Z(:,i)||_2=1. +!> If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then +!> [Z(:,i) Z(:,i+1)] span an invariant subspace and +!> the Ritz values extracted from this subspace are +!> REIG(i) + sqrt(-1)*IMEIG(i) and +!> REIG(i) - sqrt(-1)*IMEIG(i). +!> The corresponding eigenvectors are +!> Z(:,i) + sqrt(-1)*Z(:,i+1) and +!> Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. +!> || Z(:,i:i+1)||_F = 1. +!> If JOBZ == 'F', then the above descriptions hold for +!> the columns of X(:,1:K)*W(1:K,1:K), where the columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. The columns of W(1:K,1:K) +!> are similarly structured: If IMEIG(i) == 0 then +!> X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 +!> then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and +!> X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) +!> are the eigenvectors of LAMBDA(i), LAMBDA(i+1). +!> See the descriptions of REIG, IMEIG, X and W. +!> \endverbatim +!..... +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim +!..... +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs. +!> If LAMBDA(i) is real, then +!> RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. +!> If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair +!> then +!> RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F +!> where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] +!> [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. +!> It holds that +!> RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 +!> RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 +!> where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) +!> ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) +!> See the description of REIG, IMEIG and Z. +!> \endverbatim +!..... +!> \param[out] B +!> \verbatim +!> B (output) REAL(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1;K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim +!..... +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim +!..... +!> \param[out] W +!> \verbatim +!> W (workspace/output) REAL(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient (real and +!> imaginary parts for each complex conjugate pair of the +!> eigenvalues). The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> left singular vectors of X. +!> \endverbatim +!..... +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim +!..... +!> \param[out] S +!> \verbatim +!> S (workspace/output) REAL(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by SGEEV. +!> See the description of K. +!> \endverbatim +!..... +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim +!..... +!> \param[out] WORK +!> \verbatim +!> WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +!> On exit, WORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +!> If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain +!> scaling factor WORK(N+2)/WORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. +!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. +!> If the call to SGEDMD is only workspace query, then +!> WORK(1) contains the minimal workspace length and +!> WORK(2) is the optimal workspace length. Hence, the +!> length of work is at least 2. +!> See the description of LWORK. +!> \endverbatim +!..... +!> \param[in] LWORK +!> \verbatim +!> LWORK (input) INTEGER +!> The minimal length of the workspace vector WORK. +!> LWORK is calculated as follows: +!> If WHTSVD == 1 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). +!> If JOBZ == 'N' then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). +!> Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal +!> workspace length of SGESVD. +!> If WHTSVD == 2 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the +!> minimal workspace length of SGESDD. +!> If WHTSVD == 3 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = N+M+MAX(3*N+1, +!> MAX(1,3*N+M,5*N),MAX(1,N)) +!> is the minimal workspace length of SGESVDQ. +!> If WHTSVD == 4 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the +!> minimal workspace length of SGEJSV. +!> The above expressions are not simplified in order to +!> make the usage of WORK more transparent, and for +!> easier checking. In any case, LWORK >= 2. +!> If on entry LWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim +!..... +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim +!..... +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim +!..... +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! +!............................................................. +!............................................................. + SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, REIG, IMEIG, Z, LDZ, RES, & + B, LDB, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real32 +! +! Scalar arguments +! ~~~~~~~~~~~~~~~~ + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LWORK, LIWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! +! Array arguments +! ~~~~~~~~~~~~~~~ + REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & + RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +! +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP +! +! Local scalars +! ~~~~~~~~~~~~~ + REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & + SSUM, XSCL1, XSCL2 + INTEGER :: i, j, IMINWR, INFO1, INFO2, & + LWRKEV, LWRSDD, LWRSVD, & + LWRSVQ, MLWORK, MWRKEV, MWRSDD, & + MWRSVD, MWRSVJ, MWRSVQ, NUMRNK, & + OLWORK + LOGICAL :: BADXY, LQUERY, SCCOLX, SCCOLY, & + WNTEX, WNTREF, WNTRES, WNTVEC + CHARACTER :: JOBZL, T_OR_N + CHARACTER :: JSVOPT +! +! Local arrays +! ~~~~~~~~~~~~ + REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2) +! +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + REAL(KIND=WP) SLANGE, SLAMCH, SNRM2 + EXTERNAL SLANGE, SLAMCH, SNRM2, ISAMAX + INTEGER ISAMAX + LOGICAL SISNAN, LSAME + EXTERNAL SISNAN, LSAME +! +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL SAXPY, SGEMM, SSCAL + EXTERNAL SGEEV, SGEJSV, SGESDD, SGESVD, SGESVDQ, & + SLACPY, SLASCL, SLASSQ, XERBLA +! +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC INT, FLOAT, MAX, SQRT +!............................................................ +! +! Test the input arguments +! + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME(JOBS,'C') + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + INFO = 0 + LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. & + LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. LSAME(JOBZ,'N') & + .OR. LSAME(JOBZ,'F')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. (.NOT.WNTVEC) ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -4 + ELSE IF ( .NOT.((WHTSVD == 1) .OR. (WHTSVD == 2) .OR. & + (WHTSVD == 3) .OR. (WHTSVD == 4) )) THEN + INFO = -5 + ELSE IF ( M < 0 ) THEN + INFO = -6 + ELSE IF ( ( N < 0 ) .OR. ( N > M ) ) THEN + INFO = -7 + ELSE IF ( LDX < M ) THEN + INFO = -9 + ELSE IF ( LDY < M ) THEN + INFO = -11 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -12 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -13 + ELSE IF ( LDZ < M ) THEN + INFO = -18 + ELSE IF ( (WNTREF .OR. WNTEX ) .AND. ( LDB < M ) ) THEN + INFO = -21 + ELSE IF ( LDW < N ) THEN + INFO = -23 + ELSE IF ( LDS < N ) THEN + INFO = -25 + END IF +! + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( N == 0 ) THEN + ! Quick return. All output except K is void. + ! INFO=1 signals the void input. + ! In case of a workspace query, the default + ! minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + WORK(1) = 2 + WORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + MLWORK = MAX(2,N) + OLWORK = MAX(2,N) + IMINWR = 1 + SELECT CASE ( WHTSVD ) + CASE (1) + ! The following is specified as the minimal + ! length of WORK in the definition of SGESVD: + ! MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) + MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) + MLWORK = MAX(MLWORK,N + MWRSVD) + IF ( LQUERY ) THEN + CALL SGESVD( 'O', 'S', M, N, X, LDX, WORK, & + B, LDB, W, LDW, RDUMMY, -1, INFO1 ) + LWRSVD = MAX( MWRSVD, INT( RDUMMY(1) ) ) + OLWORK = MAX(OLWORK,N + LWRSVD) + END IF + CASE (2) + ! The following is specified as the minimal + ! length of WORK in the definition of SGESDD: + ! MWRSDD = 3*MIN(M,N)*MIN(M,N) + + ! MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) + ! IMINWR = 8*MIN(M,N) + MWRSDD = 3*MIN(M,N)*MIN(M,N) + & + MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) + MLWORK = MAX(MLWORK,N + MWRSDD) + IMINWR = 8*MIN(M,N) + IF ( LQUERY ) THEN + CALL SGESDD( 'O', M, N, X, LDX, WORK, B, & + LDB, W, LDW, RDUMMY, -1, IWORK, INFO1 ) + LWRSDD = MAX( MWRSDD, INT( RDUMMY(1) ) ) + OLWORK = MAX(OLWORK,N + LWRSDD) + END IF + CASE (3) + !LWQP3 = 3*N+1 + !LWORQ = MAX(N, 1) + !MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) + !MWRSVQ = N + MAX( LWQP3, MWRSVD, LWORQ )+ MAX(M,2) + !MLWORK = N + MWRSVQ + !IMINWR = M+N-1 + CALL SGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, WORK, Z, LDZ, W, LDW, & + NUMRNK, IWORK, -1, RDUMMY, & + -1, RDUMMY2, -1, INFO1 ) + IMINWR = IWORK(1) + MWRSVQ = INT(RDUMMY(2)) + MLWORK = MAX(MLWORK,N+MWRSVQ+INT(RDUMMY2(1))) + IF ( LQUERY ) THEN + LWRSVQ = INT(RDUMMY(1)) + OLWORK = MAX(OLWORK,N+LWRSVQ+INT(RDUMMY2(1))) + END IF + CASE (4) + JSVOPT = 'J' + !MWRSVJ = MAX( 7, 2*M+N, 6*N+2*N*N )! for JSVOPT='V' + MWRSVJ = MAX( 7, 2*M+N, 4*N+N*N, 2*N+N*N+6 ) + MLWORK = MAX(MLWORK,N+MWRSVJ) + IMINWR = MAX( 3, M+3*N ) + IF ( LQUERY ) THEN + OLWORK = MAX(OLWORK,N+MWRSVJ) + END IF + END SELECT + IF ( WNTVEC .OR. WNTEX .OR. LSAME(JOBZ,'F') ) THEN + JOBZL = 'V' + ELSE + JOBZL = 'N' + END IF + ! Workspace calculation to the SGEEV call + IF ( LSAME(JOBZL,'V') ) THEN + MWRKEV = MAX( 1, 4*N ) + ELSE + MWRKEV = MAX( 1, 3*N ) + END IF + MLWORK = MAX(MLWORK,N+MWRKEV) + IF ( LQUERY ) THEN + CALL SGEEV( 'N', JOBZL, N, S, LDS, REIG, & + IMEIG, W, LDW, W, LDW, RDUMMY, -1, INFO1 ) + LWRKEV = MAX( MWRKEV, INT(RDUMMY(1)) ) + OLWORK = MAX( OLWORK, N+LWRKEV ) + END IF +! + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -29 + IF ( LWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -27 + END IF +! + IF( INFO /= 0 ) THEN + CALL XERBLA( 'SGEDMD', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + WORK(1) = MLWORK + WORK(2) = OLWORK + RETURN + END IF +!............................................................ +! + OFL = SLAMCH('O') + SMALL = SLAMCH('S') + BADXY = .FALSE. +! +! <1> Optional scaling of the snapshots (columns of X, Y) +! ========================================================== + IF ( SCCOLX ) THEN + ! The columns of X will be normalized. + ! To prevent overflows, the column norms of X are + ! carefully computed using SLASSQ. + K = 0 + DO i = 1, N + !WORK(i) = DNRM2( M, X(1,i), 1 ) + SSUM = ONE + SCALE = ZERO + CALL SLASSQ( M, X(1,i), 1, SCALE, SSUM ) + IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN + K = 0 + INFO = -8 + CALL XERBLA('SGEDMD',-INFO) + END IF + IF ( (SCALE /= ZERO) .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of X(:,i) overflows. First, X(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. +! Next, the norm of X(:,i) is stored without +! overflow as WORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of X(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL SLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, X(1,i), M, INFO2 ) + WORK(i) = - SCALE * ( ROOTSC / FLOAT(M) ) + ELSE +! X(:,i) will be scaled to unit 2-norm + WORK(i) = SCALE * ROOTSC + CALL SLASCL( 'G',0, 0, WORK(i), ONE, M, 1, & + X(1,i), M, INFO2 ) ! LAPACK CALL +! X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC + END IF + ELSE + WORK(i) = ZERO + K = K + 1 + END IF + END DO + IF ( K == N ) THEN + ! All columns of X are zero. Return error code -8. + ! (the 8th input variable had an illegal value) + K = 0 + INFO = -8 + CALL XERBLA('SGEDMD',-INFO) + RETURN + END IF + DO i = 1, N +! Now, apply the same scaling to the columns of Y. + IF ( WORK(i) > ZERO ) THEN + CALL SSCAL( M, ONE/WORK(i), Y(1,i), 1 ) ! BLAS CALL +! Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC + ELSE IF ( WORK(i) < ZERO ) THEN + CALL SLASCL( 'G', 0, 0, -WORK(i), & + ONE/FLOAT(M), M, 1, Y(1,i), M, INFO2 ) ! LAPACK CALL + ELSE IF ( Y(ISAMAX(M, Y(1,i),1),i ) & + /= ZERO ) THEN +! X(:,i) is zero vector. For consistency, +! Y(:,i) should also be zero. If Y(:,i) is not +! zero, then the data might be inconsistent or +! corrupted. If JOBS == 'C', Y(:,i) is set to +! zero and a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + IF ( LSAME(JOBS,'C')) & + CALL SSCAL( M, ZERO, Y(1,i), 1 ) ! BLAS CALL + END IF + END DO + END IF + ! + IF ( SCCOLY ) THEN + ! The columns of Y will be normalized. + ! To prevent overflows, the column norms of Y are + ! carefully computed using SLASSQ. + DO i = 1, N + !WORK(i) = DNRM2( M, Y(1,i), 1 ) + SSUM = ONE + SCALE = ZERO + CALL SLASSQ( M, Y(1,i), 1, SCALE, SSUM ) + IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN + K = 0 + INFO = -10 + CALL XERBLA('SGEDMD',-INFO) + END IF + IF ( SCALE /= ZERO .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of Y(:,i) overflows. First, Y(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. +! Next, the norm of Y(:,i) is stored without +! overflow as WORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of Y(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL SLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, Y(1,i), M, INFO2 ) + WORK(i) = - SCALE * ( ROOTSC / FLOAT(M) ) + ELSE +! X(:,i) will be scaled to unit 2-norm + WORK(i) = SCALE * ROOTSC + CALL SLASCL( 'G',0, 0, WORK(i), ONE, M, 1, & + Y(1,i), M, INFO2 ) ! LAPACK CALL +! Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC + END IF + ELSE + WORK(i) = ZERO + END IF + END DO + DO i = 1, N +! Now, apply the same scaling to the columns of X. + IF ( WORK(i) > ZERO ) THEN + CALL SSCAL( M, ONE/WORK(i), X(1,i), 1 ) ! BLAS CALL +! X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC + ELSE IF ( WORK(i) < ZERO ) THEN + CALL SLASCL( 'G', 0, 0, -WORK(i), & + ONE/FLOAT(M), M, 1, X(1,i), M, INFO2 ) ! LAPACK CALL + ELSE IF ( X(ISAMAX(M, X(1,i),1),i ) & + /= ZERO ) THEN +! Y(:,i) is zero vector. If X(:,i) is not +! zero, then a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + END IF + END DO + END IF +! +! <2> SVD of the data snapshot matrix X. +! ===================================== +! The left singular vectors are stored in the array X. +! The right singular vectors are in the array W. +! The array W will later on contain the eigenvectors +! of a Rayleigh quotient. + NUMRNK = N + SELECT CASE ( WHTSVD ) + CASE (1) + CALL SGESVD( 'O', 'S', M, N, X, LDX, WORK, B, & + LDB, W, LDW, WORK(N+1), LWORK-N, INFO1 ) ! LAPACK CALL + T_OR_N = 'T' + CASE (2) + CALL SGESDD( 'O', M, N, X, LDX, WORK, B, LDB, W, & + LDW, WORK(N+1), LWORK-N, IWORK, INFO1 ) ! LAPACK CALL + T_OR_N = 'T' + CASE (3) + CALL SGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, WORK, Z, LDZ, W, LDW, & + NUMRNK, IWORK, LIWORK, WORK(N+MAX(2,M)+1),& + LWORK-N-MAX(2,M), WORK(N+1), MAX(2,M), INFO1) ! LAPACK CALL + CALL SLACPY( 'A', M, NUMRNK, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'T' + CASE (4) + CALL SGEJSV( 'F', 'U', JSVOPT, 'N', 'N', 'P', M, & + N, X, LDX, WORK, Z, LDZ, W, LDW, & + WORK(N+1), LWORK-N, IWORK, INFO1 ) ! LAPACK CALL + CALL SLACPY( 'A', M, N, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'N' + XSCL1 = WORK(N+1) + XSCL2 = WORK(N+2) + IF ( XSCL1 /= XSCL2 ) THEN + ! This is an exceptional situation. If the + ! data matrices are not scaled and the + ! largest singular value of X overflows. + ! In that case SGEJSV can return the SVD + ! in scaled form. The scaling factor can be used + ! to rescale the data (X and Y). + CALL SLASCL( 'G', 0, 0, XSCL1, XSCL2, M, N, Y, LDY, INFO2 ) + END IF + END SELECT +! + IF ( INFO1 > 0 ) THEN + ! The SVD selected subroutine did not converge. + ! Return with an error code. + INFO = 2 + RETURN + END IF +! + IF ( WORK(1) == ZERO ) THEN + ! The largest computed singular value of (scaled) + ! X is zero. Return error code -8 + ! (the 8th input variable had an illegal value). + K = 0 + INFO = -8 + CALL XERBLA('SGEDMD',-INFO) + RETURN + END IF +! + !<3> Determine the numerical rank of the data + ! snapshots matrix X. This depends on the + ! parameters NRNK and TOL. + + SELECT CASE ( NRNK ) + CASE ( -1 ) + K = 1 + DO i = 2, NUMRNK + IF ( ( WORK(i) <= WORK(1)*TOL ) .OR. & + ( WORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE ( -2 ) + K = 1 + DO i = 1, NUMRNK-1 + IF ( ( WORK(i+1) <= WORK(i)*TOL ) .OR. & + ( WORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE DEFAULT + K = 1 + DO i = 2, NRNK + IF ( WORK(i) <= SMALL ) EXIT + K = K + 1 + END DO + END SELECT + ! Now, U = X(1:M,1:K) is the SVD/POD basis for the + ! snapshot data in the input matrix X. + + !<4> Compute the Rayleigh quotient S = U^T * A * U. + ! Depending on the requested outputs, the computation + ! is organized to compute additional auxiliary + ! matrices (for the residuals and refinements). + ! + ! In all formulas below, we need V_k*Sigma_k^(-1) + ! where either V_k is in W(1:N,1:K), or V_k^T is in + ! W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). + IF ( LSAME(T_OR_N, 'N') ) THEN + DO i = 1, K + CALL SSCAL( N, ONE/WORK(i), W(1,i), 1 ) ! BLAS CALL + ! W(1:N,i) = (ONE/WORK(i)) * W(1:N,i) ! INTRINSIC + END DO + ELSE + ! This non-unit stride access is due to the fact + ! that SGESVD, SGESVDQ and SGESDD return the + ! transposed matrix of the right singular vectors. + !DO i = 1, K + ! CALL SSCAL( N, ONE/WORK(i), W(i,1), LDW ) ! BLAS CALL + ! ! W(i,1:N) = (ONE/WORK(i)) * W(i,1:N) ! INTRINSIC + !END DO + DO i = 1, K + WORK(N+i) = ONE/WORK(i) + END DO + DO j = 1, N + DO i = 1, K + W(i,j) = (WORK(N+i))*W(i,j) + END DO + END DO + END IF +! + IF ( WNTREF ) THEN + ! + ! Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) + ! for computing the refined Ritz vectors + ! (optionally, outside SGEDMD). + CALL SGEMM( 'N', T_OR_N, M, K, N, ONE, Y, LDY, W, & + LDW, ZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T' + ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K)) ! INTRINSIC, for T_OR_N=='N' + ! + ! At this point Z contains + ! A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and + ! this is needed for computing the residuals. + ! This matrix is returned in the array B and + ! it can be used to compute refined Ritz vectors. + CALL SLACPY( 'A', M, K, Z, LDZ, B, LDB ) ! BLAS CALL + ! B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC + + CALL SGEMM( 'T', 'N', K, K, M, ONE, X, LDX, Z, & + LDZ, ZERO, S, LDS ) ! BLAS CALL + ! S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRINSIC + ! At this point S = U^T * A * U is the Rayleigh quotient. + ELSE + ! A * U(:,1:K) is not explicitly needed and the + ! computation is organized differently. The Rayleigh + ! quotient is computed more efficiently. + CALL SGEMM( 'T', 'N', K, N, M, ONE, X, LDX, Y, LDY, & + ZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) ) ! INTRINSIC + ! In the two SGEMM calls here, can use K for LDZ + CALL SGEMM( 'N', T_OR_N, K, K, N, ONE, Z, LDZ, W, & + LDW, ZERO, S, LDS ) ! BLAS CALL + ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T' + ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K))) ! INTRINSIC, for T_OR_N=='N' + ! At this point S = U^T * A * U is the Rayleigh quotient. + ! If the residuals are requested, save scaled V_k into Z. + ! Recall that V_k or V_k^T is stored in W. + IF ( WNTRES .OR. WNTEX ) THEN + IF ( LSAME(T_OR_N, 'N') ) THEN + CALL SLACPY( 'A', N, K, W, LDW, Z, LDZ ) + ELSE + CALL SLACPY( 'A', K, N, W, LDW, Z, LDZ ) + END IF + END IF + END IF +! + !<5> Compute the Ritz values and (if requested) the + ! right eigenvectors of the Rayleigh quotient. + ! + CALL SGEEV( 'N', JOBZL, K, S, LDS, REIG, IMEIG, W, & + LDW, W, LDW, WORK(N+1), LWORK-N, INFO1 ) ! LAPACK CALL + ! + ! W(1:K,1:K) contains the eigenvectors of the Rayleigh + ! quotient. Even in the case of complex spectrum, all + ! computation is done in real arithmetic. REIG and + ! IMEIG are the real and the imaginary parts of the + ! eigenvalues, so that the spectrum is given as + ! REIG(:) + sqrt(-1)*IMEIG(:). Complex conjugate pairs + ! are listed at consecutive positions. For such a + ! complex conjugate pair of the eigenvalues, the + ! corresponding eigenvectors are also a complex + ! conjugate pair with the real and imaginary parts + ! stored column-wise in W at the corresponding + ! consecutive column indices. See the description of Z. + ! Also, see the description of SGEEV. + IF ( INFO1 > 0 ) THEN + ! SGEEV failed to compute the eigenvalues and + ! eigenvectors of the Rayleigh quotient. + INFO = 3 + RETURN + END IF +! + ! <6> Compute the eigenvectors (if requested) and, + ! the residuals (if requested). + ! + IF ( WNTVEC .OR. WNTEX ) THEN + IF ( WNTRES ) THEN + IF ( WNTREF ) THEN + ! Here, if the refinement is requested, we have + ! A*U(:,1:K) already computed and stored in Z. + ! For the residuals, need Y = A * U(:,1;K) * W. + CALL SGEMM( 'N', 'N', M, K, K, ONE, Z, LDZ, W, & + LDW, ZERO, Y, LDY ) ! BLAS CALL + ! Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC + ! This frees Z; Y contains A * U(:,1:K) * W. + ELSE + ! Compute S = V_k * Sigma_k^(-1) * W, where + ! V_k * Sigma_k^(-1) is stored in Z + CALL SGEMM( T_OR_N, 'N', N, K, K, ONE, Z, LDZ, & + W, LDW, ZERO, S, LDS ) + ! Then, compute Z = Y * S = + ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = + ! = A * U(:,1:K) * W(1:K,1:K) + CALL SGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & + LDS, ZERO, Z, LDZ ) + ! Save a copy of Z into Y and free Z for holding + ! the Ritz vectors. + CALL SLACPY( 'A', M, K, Z, LDZ, Y, LDY ) + IF ( WNTEX ) CALL SLACPY( 'A', M, K, Z, LDZ, B, LDB ) + END IF + ELSE IF ( WNTEX ) THEN + ! Compute S = V_k * Sigma_k^(-1) * W, where + ! V_k * Sigma_k^(-1) is stored in Z + CALL SGEMM( T_OR_N, 'N', N, K, K, ONE, Z, LDZ, & + W, LDW, ZERO, S, LDS ) + ! Then, compute Z = Y * S = + ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = + ! = A * U(:,1:K) * W(1:K,1:K) + CALL SGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & + LDS, ZERO, B, LDB ) + ! The above call replaces the following two calls + ! that were used in the developing-testing phase. + ! CALL SGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & + ! LDS, ZERO, Z, LDZ) + ! Save a copy of Z into B and free Z for holding + ! the Ritz vectors. + ! CALL SLACPY( 'A', M, K, Z, LDZ, B, LDB ) + END IF +! + ! Compute the real form of the Ritz vectors + IF ( WNTVEC ) CALL SGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, & + ZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC +! + IF ( WNTRES ) THEN + i = 1 + DO WHILE ( i <= K ) + IF ( IMEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL SAXPY( M, -REIG(i), Z(1,i), 1, Y(1,i), 1 ) ! BLAS CALL + ! Y(1:M,i) = Y(1:M,i) - REIG(i) * Z(1:M,i) ! INTRINSIC + RES(i) = SNRM2( M, Y(1,i), 1 ) ! BLAS CALL + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IMEIG(i) + AB(1,2) = IMEIG(i) + AB(2,2) = REIG(i) + CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), & + LDZ, AB, 2, ONE, Y(1,i), LDY ) ! BLAS CALL + ! Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB ! INTRINSIC + RES(i) = SLANGE( 'F', M, 2, Y(1,i), LDY, & + WORK(N+1) ) ! LAPACK CALL + RES(i+1) = RES(i) + i = i + 2 + END IF + END DO + END IF + END IF +! + IF ( WHTSVD == 4 ) THEN + WORK(N+1) = XSCL1 + WORK(N+2) = XSCL2 + END IF +! +! Successful exit. + IF ( .NOT. BADXY ) THEN + INFO = 0 + ELSE + ! A warning on possible data inconsistency. + ! This should be a rare event. + INFO = 4 + END IF +!............................................................ + RETURN +! ...... + END SUBROUTINE SGEDMD + diff --git a/lapack-netlib/SRC/sgedmdq.c b/lapack-netlib/SRC/sgedmdq.c new file mode 100644 index 000000000..0adf3bda3 --- /dev/null +++ b/lapack-netlib/SRC/sgedmdq.c @@ -0,0 +1,1296 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i= 0 */ +/* The state space dimension (the number of rows of F) */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshots from a single trajectory, */ +/* taken at equidistant discrete times. This is the */ +/* number of columns of F. */ +/* ..... */ +/* F (input/output) REAL(KIND=WP) M-by-N array */ +/* > On entry, */ +/* the columns of F are the sequence of data snapshots */ +/* from a single trajectory, taken at equidistant discrete */ +/* times. It is assumed that the column norms of F are */ +/* in the range of the normalized floating point numbers. */ +/* < On exit, */ +/* If JOBQ == 'Q', the array F contains the orthogonal */ +/* matrix/factor of the QR factorization of the initial */ +/* data snapshots matrix F. See the description of JOBQ. */ +/* If JOBQ == 'N', the entries in F strictly below the main */ +/* diagonal contain, column-wise, the information on the */ +/* Householder vectors, as returned by SGEQRF. The */ +/* remaining information to restore the orthogonal matrix */ +/* of the initial QR factorization is stored in WORK(1:N). */ +/* See the description of WORK. */ +/* ..... */ +/* LDF (input) INTEGER, LDF >= M */ +/* The leading dimension of the array F. */ +/* ..... */ +/* X (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array */ +/* X is used as workspace to hold representations of the */ +/* leading N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, the leading K columns of X contain the leading */ +/* K left singular vectors of the above described content */ +/* of X. To lift them to the space of the left singular */ +/* vectors U(:,1:K)of the input data, pre-multiply with the */ +/* Q factor from the initial QR factorization. */ +/* See the descriptions of F, K, V and Z. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= N */ +/* The leading dimension of the array X */ +/* ..... */ +/* Y (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array */ +/* Y is used as workspace to hold representations of the */ +/* trailing N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, */ +/* If JOBT == 'R', Y contains the MIN(M,N)-by-N upper */ +/* triangular factor from the QR factorization of the data */ +/* snapshot matrix F. */ +/* ..... */ +/* LDY (input) INTEGER , LDY >= N */ +/* The leading dimension of the array Y */ +/* ..... */ +/* NRNK (input) INTEGER */ +/* Determines the mode how to compute the numerical rank, */ +/* i.e. how to truncate small singular values of the input */ +/* matrix X. On input, if */ +/* NRNK = -1 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(1) */ +/* This option is recommended. */ +/* NRNK = -2 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(i-1) */ +/* This option is included for R&D purposes. */ +/* It requires highly accurate SVD, which */ +/* may not be feasible. */ +/* The numerical rank can be enforced by using positive */ +/* value of NRNK as follows: */ +/* 0 < NRNK <= N-1 :: at most NRNK largest singular values */ +/* will be used. If the number of the computed nonzero */ +/* singular values is less than NRNK, then only those */ +/* nonzero values will be used and the actually used */ +/* dimension is less than NRNK. The actual number of */ +/* the nonzero singular values is returned in the variable */ +/* K. See the description of K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the SVD/POD basis for the leading N-1 */ +/* data snapshots (columns of F) and the number of the */ +/* computed Ritz pairs. The value of K is determined */ +/* according to the rule set by the parameters NRNK and */ +/* TOL. See the descriptions of NRNK and TOL. */ +/* ..... */ +/* REIG (output) REAL(KIND=WP) (N-1)-by-1 array */ +/* The leading K (K<=N) entries of REIG contain */ +/* the real parts of the computed eigenvalues */ +/* REIG(1:K) + sqrt(-1)*IMEIG(1:K). */ +/* See the descriptions of K, IMEIG, Z. */ +/* ..... */ +/* IMEIG (output) REAL(KIND=WP) (N-1)-by-1 array */ +/* The leading K (K0, then the corresponding complex */ +/* conjugate pair of eigenvalues reads */ +/* LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) */ +/* LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) */ +/* That is, complex conjugate pairs have consecutive */ +/* indices (i,i+1), with the positive imaginary part */ +/* listed first. */ +/* See the descriptions of K, REIG, Z. */ +/* ..... */ +/* Z (workspace/output) REAL(KIND=WP) M-by-(N-1) array */ +/* If JOBZ =='V' then */ +/* Z contains real Ritz vectors as follows: */ +/* If IMEIG(i)=0, then Z(:,i) is an eigenvector of */ +/* the i-th Ritz value. */ +/* If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then */ +/* [Z(:,i) Z(:,i+1)] span an invariant subspace and */ +/* the Ritz values extracted from this subspace are */ +/* REIG(i) + sqrt(-1)*IMEIG(i) and */ +/* REIG(i) - sqrt(-1)*IMEIG(i). */ +/* The corresponding eigenvectors are */ +/* Z(:,i) + sqrt(-1)*Z(:,i+1) and */ +/* Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. */ +/* If JOBZ == 'F', then the above descriptions hold for */ +/* the columns of Z*V, where the columns of V are the */ +/* eigenvectors of the K-by-K Rayleigh quotient, and Z is */ +/* orthonormal. The columns of V are similarly structured: */ +/* If IMEIG(i) == 0 then Z*V(:,i) is an eigenvector, and if */ +/* IMEIG(i) > 0 then Z*V(:,i)+sqrt(-1)*Z*V(:,i+1) and */ +/* Z*V(:,i)-sqrt(-1)*Z*V(:,i+1) */ +/* are the eigenvectors of LAMBDA(i), LAMBDA(i+1). */ +/* See the descriptions of REIG, IMEIG, X and V. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... */ +/* RES (output) REAL(KIND=WP) (N-1)-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs. */ +/* If LAMBDA(i) is real, then */ +/* RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. */ +/* If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair */ +/* then */ +/* RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F */ +/* where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] */ +/* [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. */ +/* It holds that */ +/* RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 */ +/* RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 */ +/* where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) */ +/* ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) */ +/* See the description of Z. */ +/* ..... */ +/* B (output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array. */ +/* IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:N,1;K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* In both cases, the content of B can be lifted to the */ +/* original dimension of the input data by pre-multiplying */ +/* with the Q factor from the initial QR factorization. */ +/* Here A denotes a compression of the underlying operator. */ +/* See the descriptions of F and X. */ +/* If JOBF =='N', then B is not referenced. */ +/* ..... */ +/* LDB (input) INTEGER, LDB >= MIN(M,N) */ +/* The leading dimension of the array B. */ +/* ..... */ +/* V (workspace/output) REAL(KIND=WP) (N-1)-by-(N-1) array */ +/* On exit, V(1:K,1:K) contains the K eigenvectors of */ +/* the Rayleigh quotient. The eigenvectors of a complex */ +/* conjugate pair of eigenvalues are returned in real form */ +/* as explained in the description of Z. The Ritz vectors */ +/* (returned in Z) are the product of X and V; see */ +/* the descriptions of X and Z. */ +/* ..... */ +/* LDV (input) INTEGER, LDV >= N-1 */ +/* The leading dimension of the array V. */ +/* ..... */ +/* S (output) REAL(KIND=WP) (N-1)-by-(N-1) array */ +/* The array S(1:K,1:K) is used for the matrix Rayleigh */ +/* quotient. This content is overwritten during */ +/* the eigenvalue decomposition by SGEEV. */ +/* See the description of K. */ +/* ..... */ +/* LDS (input) INTEGER, LDS >= N-1 */ +/* The leading dimension of the array S. */ +/* ..... */ +/* WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */ +/* On exit, */ +/* WORK(1:MIN(M,N)) contains the scalar factors of the */ +/* elementary reflectors as returned by SGEQRF of the */ +/* M-by-N input matrix F. */ +/* WORK(MIN(M,N)+1:MIN(M,N)+N-1) contains the singular values of */ +/* the input submatrix F(1:M,1:N-1). */ +/* If the call to SGEDMDQ is only workspace query, then */ +/* WORK(1) contains the minimal workspace length and */ +/* WORK(2) is the optimal workspace length. Hence, the */ +/* length of work is at least 2. */ +/* See the description of LWORK. */ +/* ..... */ +/* LWORK (input) INTEGER */ +/* The minimal length of the workspace vector WORK. */ +/* LWORK is calculated as follows: */ +/* Let MLWQR = N (minimal workspace for SGEQRF[M,N]) */ +/* MLWDMD = minimal workspace for SGEDMD (see the */ +/* description of LWORK in SGEDMD) for */ +/* snapshots of dimensions MIN(M,N)-by-(N-1) */ +/* MLWMQR = N (minimal workspace for */ +/* SORMQR['L','N',M,N,N]) */ +/* MLWGQR = N (minimal workspace for SORGQR[M,N,N]) */ +/* Then */ +/* LWORK = MAX(N+MLWQR, N+MLWDMD) */ +/* is updated as follows: */ +/* if JOBZ == 'V' or JOBZ == 'F' THEN */ +/* LWORK = MAX( LWORK,MIN(M,N)+N-1 +MLWMQR ) */ +/* if JOBQ == 'Q' THEN */ +/* LWORK = MAX( LWORK,MIN(M,N)+N-1+MLWGQR) */ +/* If on entry LWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */ +/* Workspace that is required only if WHTSVD equals */ +/* 2 , 3 or 4. (See the description of WHTSVD). */ +/* If on entry LWORK =-1 or LIWORK=-1, then the */ +/* minimal length of IWORK is computed and returned in */ +/* IWORK(1). See the description of LIWORK. */ +/* ..... */ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* Let M1=MIN(M,N), N1=N-1. Then */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) */ +/* If on entry LIWORK = -1, then a worskpace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). */ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ + +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ + +/* Local array */ +/* ~~~~~~~~~~~ */ + +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ + +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* External subroutines */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ + /* Parameter adjustments */ + f_dim1 = *ldf; + f_offset = 1 + f_dim1 * 1; + f -= f_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --reig; + --imeig; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + v_dim1 = *ldv; + v_offset = 1 + v_dim1 * 1; + v -= v_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --work; + --iwork; + + /* Function Body */ + one = 1.f; + zero = 0.f; +/* .......................................................... */ + +/* Test the input arguments */ + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntvcf = lsame_(jobz, "F"); + wntvcq = lsame_(jobz, "Q"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + wantq = lsame_(jobq, "Q"); + wnttrf = lsame_(jobt, "R"); + minmn = f2cmin(*m,*n); + *info = 0; + lquery = *lwork == -1 || *liwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || wntvcf || wntvcq || lsame_(jobz, "N"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && lsame_(jobz, "N")) { + *info = -3; + } else if (! (wantq || lsame_(jobq, "N"))) { + *info = -4; + } else if (! (wnttrf || lsame_(jobt, "N"))) { + *info = -5; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -6; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -7; + } else if (*m < 0) { + *info = -8; + } else if (*n < 0 || *n > *m + 1) { + *info = -9; + } else if (*ldf < *m) { + *info = -11; + } else if (*ldx < minmn) { + *info = -13; + } else if (*ldy < minmn) { + *info = -15; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -16; + } else if (*tol < zero || *tol >= one) { + *info = -17; + } else if (*ldz < *m) { + *info = -22; + } else if ((wntref || wntex) && *ldb < minmn) { + *info = -25; + } else if (*ldv < *n - 1) { + *info = -27; + } else if (*lds < *n - 1) { + *info = -29; + } + + if (wntvec || wntvcf) { + *(unsigned char *)jobvl = 'V'; + } else { + *(unsigned char *)jobvl = 'N'; + } + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0 || *n == 1) { +/* All output except K is void. INFO=1 signals */ +/* the void input. In case of a workspace query, */ +/* the minimal workspace lengths are returned. */ + if (lquery) { + iwork[1] = 1; + work[1] = 2.f; + work[2] = 2.f; + } else { + *k = 0; + } + *info = 1; + return 0; + } + mlwqr = f2cmax(1,*n); +/* Minimal workspace length for SGEQRF. */ + mlwork = f2cmin(*m,*n) + mlwqr; + if (lquery) { + sgeqrf_(m, n, &f[f_offset], ldf, &work[1], rdummy, &c_n1, &info1); + olwqr = (integer) rdummy[0]; + olwork = f2cmin(*m,*n) + olwqr; + } + i__1 = *n - 1; + sgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], + ldx, &y[y_offset], ldy, nrnk, tol, k, &reig[1], &imeig[1], & + z__[z_offset], ldz, &res[1], &b[b_offset], ldb, &v[v_offset], + ldv, &s[s_offset], lds, &work[1], &c_n1, &iwork[1], liwork, & + info1); + mlwdmd = (integer) work[1]; +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwdmd; + mlwork = f2cmax(i__1,i__2); + iminwr = iwork[1]; + if (lquery) { + olwdmd = (integer) work[2]; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwdmd; + olwork = f2cmax(i__1,i__2); + } + if (wntvec || wntvcf) { + mlwmqr = f2cmax(1,*n); +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + *n - 1 + mlwmqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + sormqr_("L", "N", m, n, &minmn, &f[f_offset], ldf, &work[1], & + z__[z_offset], ldz, &work[1], &c_n1, &info1); + olwmqr = (integer) work[1]; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + *n - 1 + olwmqr; + olwork = f2cmax(i__1,i__2); + } + } + if (wantq) { + mlwgqr = *n; +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + *n - 1 + mlwgqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + sorgqr_(m, &minmn, &minmn, &f[f_offset], ldf, &work[1], &work[ + 1], &c_n1, &info1); + olwgqr = (integer) work[1]; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + *n - 1 + olwgqr; + olwork = f2cmax(i__1,i__2); + } + } + iminwr = f2cmax(1,iminwr); + mlwork = f2cmax(2,mlwork); + if (*lwork < mlwork && ! lquery) { + *info = -31; + } + if (*liwork < iminwr && ! lquery) { + *info = -33; + } + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("SGEDMDQ", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + work[1] = (real) mlwork; + work[2] = (real) olwork; + return 0; + } +/* ..... */ +/* Initial QR factorization that is used to represent the */ +/* snapshots as elements of lower dimensional subspace. */ +/* For large scale computation with M >>N , at this place */ +/* one can use an out of core QRF. */ + + i__1 = *lwork - minmn; + sgeqrf_(m, n, &f[f_offset], ldf, &work[1], &work[minmn + 1], &i__1, & + info1); + +/* Define X and Y as the snapshots representations in the */ +/* orthogonal basis computed in the QR factorization. */ +/* X corresponds to the leading N-1 and Y to the trailing */ +/* N-1 snapshots. */ + i__1 = *n - 1; + slaset_("L", &minmn, &i__1, &zero, &zero, &x[x_offset], ldx); + i__1 = *n - 1; + slacpy_("U", &minmn, &i__1, &f[f_offset], ldf, &x[x_offset], ldx); + i__1 = *n - 1; + slacpy_("A", &minmn, &i__1, &f[(f_dim1 << 1) + 1], ldf, &y[y_offset], ldy); + if (*m >= 3) { + i__1 = minmn - 2; + i__2 = *n - 2; + slaset_("L", &i__1, &i__2, &zero, &zero, &y[y_dim1 + 3], ldy); + } + +/* Compute the DMD of the projected snapshot pairs (X,Y) */ + i__1 = *n - 1; + i__2 = *lwork - minmn; + sgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], ldx, + &y[y_offset], ldy, nrnk, tol, k, &reig[1], &imeig[1], &z__[ + z_offset], ldz, &res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[ + s_offset], lds, &work[minmn + 1], &i__2, &iwork[1], liwork, & + info1); + if (info1 == 2 || info1 == 3) { +/* Return with error code. */ + *info = info1; + return 0; + } else { + *info = info1; + } + +/* The Ritz vectors (Koopman modes) can be explicitly */ +/* formed or returned in factored form. */ + if (wntvec) { +/* Compute the eigenvectors explicitly. */ + if (*m > minmn) { + i__1 = *m - minmn; + slaset_("A", &i__1, k, &zero, &zero, &z__[minmn + 1 + z_dim1], + ldz); + } + i__1 = *lwork - (minmn + *n - 1); + sormqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &work[1], &z__[ + z_offset], ldz, &work[minmn + *n], &i__1, &info1); + } else if (wntvcf) { +/* Return the Ritz vectors (eigenvectors) in factored */ +/* form Z*V, where Z contains orthonormal matrix (the */ +/* product of Q from the initial QR factorization and */ +/* the SVD/POD_basis returned by SGEDMD in X) and the */ +/* second factor (the eigenvectors of the Rayleigh */ +/* quotient) is in the array V, as returned by SGEDMD. */ + slacpy_("A", n, k, &x[x_offset], ldx, &z__[z_offset], ldz); + if (*m > *n) { + i__1 = *m - *n; + slaset_("A", &i__1, k, &zero, &zero, &z__[*n + 1 + z_dim1], ldz); + } + i__1 = *lwork - (minmn + *n - 1); + sormqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &work[1], &z__[ + z_offset], ldz, &work[minmn + *n], &i__1, &info1); + } + +/* Some optional output variables: */ + +/* The upper triangular factor in the initial QR */ +/* factorization is optionally returned in the array Y. */ +/* This is useful if this call to SGEDMDQ is to be */ +/* followed by a streaming DMD that is implemented in a */ +/* QR compressed form. */ + if (wnttrf) { +/* Return the upper triangular R in Y */ + slaset_("A", &minmn, n, &zero, &zero, &y[y_offset], ldy); + slacpy_("U", &minmn, n, &f[f_offset], ldf, &y[y_offset], ldy); + } + +/* The orthonormal/orthogonal factor in the initial QR */ +/* factorization is optionally returned in the array F. */ +/* Same as with the triangular factor above, this is */ +/* useful in a streaming DMD. */ + if (wantq) { +/* Q overwrites F */ + i__1 = *lwork - (minmn + *n - 1); + sorgqr_(m, &minmn, &minmn, &f[f_offset], ldf, &work[1], &work[minmn + + *n], &i__1, &info1); + } + + return 0; + +} /* sgedmdq_ */ + diff --git a/lapack-netlib/SRC/sgedmdq.f90 b/lapack-netlib/SRC/sgedmdq.f90 new file mode 100644 index 000000000..acd5d56c6 --- /dev/null +++ b/lapack-netlib/SRC/sgedmdq.f90 @@ -0,0 +1,703 @@ +SUBROUTINE SGEDMDQ( JOBS, JOBZ, JOBR, JOBQ, JOBT, JOBF, & + WHTSVD, M, N, F, LDF, X, LDX, Y, & + LDY, NRNK, TOL, K, REIG, IMEIG, & + Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, WORK, LWORK, IWORK, LIWORK, INFO ) +! March 2023 +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real32 +!..... +! Scalar arguments + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBQ, & + JOBT, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDF, LDX, & + LDY, NRNK, LDZ, LDB, LDV, & + LDS, LWORK, LIWORK + INTEGER, INTENT(OUT) :: INFO, K + REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments + REAL(KIND=WP), INTENT(INOUT) :: F(LDF,*) + REAL(KIND=WP), INTENT(OUT) :: X(LDX,*), Y(LDY,*), & + Z(LDZ,*), B(LDB,*), & + V(LDV,*), S(LDS,*) + REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & + RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +!..... +! Purpose +! ======= +! SGEDMDQ computes the Dynamic Mode Decomposition (DMD) for +! a pair of data snapshot matrices, using a QR factorization +! based compression of the data. For the input matrices +! X and Y such that Y = A*X with an unaccessible matrix +! A, SGEDMDQ computes a certain number of Ritz pairs of A using +! the standard Rayleigh-Ritz extraction from a subspace of +! range(X) that is determined using the leading left singular +! vectors of X. Optionally, SGEDMDQ returns the residuals +! of the computed Ritz pairs, the information needed for +! a refinement of the Ritz vectors, or the eigenvectors of +! the Exact DMD. +! For further details see the references listed +! below. For more details of the implementation see [3]. +! +! References +! ========== +! [1] P. Schmid: Dynamic mode decomposition of numerical +! and experimental data, +! Journal of Fluid Mechanics 656, 5-28, 2010. +! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +! decompositions: analysis and enhancements, +! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +! [3] Z. Drmac: A LAPACK implementation of the Dynamic +! Mode Decomposition I. Technical report. AIMDyn Inc. +! and LAPACK Working Note 298. +! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +! Brunton, N. Kutz: On Dynamic Mode Decomposition: +! Theory and Applications, Journal of Computational +! Dynamics 1(2), 391 -421, 2014. +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! and supported by +! - DARPA SBIR project "Koopman Operator-Based Forecasting +! for Nonstationary Processes from Near-Term, Limited +! Observational Data" Contract No: W31P4Q-21-C-0007 +! - DARPA PAI project "Physics-Informed Machine Learning +! Methodologies" Contract No: HR0011-18-9-0033 +! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +! Framework for Space-Time Analysis of Process Dynamics" +! Contract No: HR0011-16-C-0116 +! Any opinions, findings and conclusions or recommendations +! expressed in this material are those of the author and +! do not necessarily reflect the views of the DARPA SBIR +! Program Office. +!============================================================ +! Distribution Statement A: +! Approved for Public Release, Distribution Unlimited. +! Cleared by DARPA on September 29, 2022 +!============================================================ +!...................................................................... +! Arguments +! ========= +! JOBS (input) CHARACTER*1 +! Determines whether the initial data snapshots are scaled +! by a diagonal matrix. The data snapshots are the columns +! of F. The leading N-1 columns of F are denoted X and the +! trailing N-1 columns are denoted Y. +! 'S' :: The data snapshots matrices X and Y are multiplied +! with a diagonal matrix D so that X*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'C' :: The snapshots are scaled as with the 'S' option. +! If it is found that an i-th column of X is zero +! vector and the corresponding i-th column of Y is +! non-zero, then the i-th column of Y is set to +! zero and a warning flag is raised. +! 'Y' :: The data snapshots matrices X and Y are multiplied +! by a diagonal matrix D so that Y*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'N' :: No data scaling. +!..... +! JOBZ (input) CHARACTER*1 +! Determines whether the eigenvectors (Koopman modes) will +! be computed. +! 'V' :: The eigenvectors (Koopman modes) will be computed +! and returned in the matrix Z. +! See the description of Z. +! 'F' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Z*V, where Z +! is orthonormal and V contains the eigenvectors +! of the corresponding Rayleigh quotient. +! See the descriptions of F, V, Z. +! 'Q' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Q*Z, where Z +! contains the eigenvectors of the compression of the +! underlying discretized operator onto the span of +! the data snapshots. See the descriptions of F, V, Z. +! Q is from the initial QR factorization. +! 'N' :: The eigenvectors are not computed. +!..... +! JOBR (input) CHARACTER*1 +! Determines whether to compute the residuals. +! 'R' :: The residuals for the computed eigenpairs will +! be computed and stored in the array RES. +! See the description of RES. +! For this option to be legal, JOBZ must be 'V'. +! 'N' :: The residuals are not computed. +!..... +! JOBQ (input) CHARACTER*1 +! Specifies whether to explicitly compute and return the +! orthogonal matrix from the QR factorization. +! 'Q' :: The matrix Q of the QR factorization of the data +! snapshot matrix is computed and stored in the +! array F. See the description of F. +! 'N' :: The matrix Q is not explicitly computed. +!..... +! JOBT (input) CHARACTER*1 +! Specifies whether to return the upper triangular factor +! from the QR factorization. +! 'R' :: The matrix R of the QR factorization of the data +! snapshot matrix F is returned in the array Y. +! See the description of Y and Further details. +! 'N' :: The matrix R is not returned. +!..... +! JOBF (input) CHARACTER*1 +! Specifies whether to store information needed for post- +! processing (e.g. computing refined Ritz vectors) +! 'R' :: The matrix needed for the refinement of the Ritz +! vectors is computed and stored in the array B. +! See the description of B. +! 'E' :: The unscaled eigenvectors of the Exact DMD are +! computed and returned in the array B. See the +! description of B. +! 'N' :: No eigenvector refinement data is computed. +! To be useful on exit, this option needs JOBQ='Q'. +!..... +! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +! Allows for a selection of the SVD algorithm from the +! LAPACK library. +! 1 :: SGESVD (the QR SVD algorithm) +! 2 :: SGESDD (the Divide and Conquer algorithm; if enough +! workspace available, this is the fastest option) +! 3 :: SGESVDQ (the preconditioned QR SVD ; this and 4 +! are the most accurate options) +! 4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3 +! are the most accurate options) +! For the four methods above, a significant difference in +! the accuracy of small singular values is possible if +! the snapshots vary in norm so that X is severely +! ill-conditioned. If small (smaller than EPS*||X||) +! singular values are of interest and JOBS=='N', then +! the options (3, 4) give the most accurate results, where +! the option 4 is slightly better and with stronger +! theoretical background. +! If JOBS=='S', i.e. the columns of X will be normalized, +! then all methods give nearly equally accurate results. +!..... +! M (input) INTEGER, M >= 0 +! The state space dimension (the number of rows of F) +!..... +! N (input) INTEGER, 0 <= N <= M +! The number of data snapshots from a single trajectory, +! taken at equidistant discrete times. This is the +! number of columns of F. +!..... +! F (input/output) REAL(KIND=WP) M-by-N array +! > On entry, +! the columns of F are the sequence of data snapshots +! from a single trajectory, taken at equidistant discrete +! times. It is assumed that the column norms of F are +! in the range of the normalized floating point numbers. +! < On exit, +! If JOBQ == 'Q', the array F contains the orthogonal +! matrix/factor of the QR factorization of the initial +! data snapshots matrix F. See the description of JOBQ. +! If JOBQ == 'N', the entries in F strictly below the main +! diagonal contain, column-wise, the information on the +! Householder vectors, as returned by SGEQRF. The +! remaining information to restore the orthogonal matrix +! of the initial QR factorization is stored in WORK(1:N). +! See the description of WORK. +!..... +! LDF (input) INTEGER, LDF >= M +! The leading dimension of the array F. +!..... +! X (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array +! X is used as workspace to hold representations of the +! leading N-1 snapshots in the orthonormal basis computed +! in the QR factorization of F. +! On exit, the leading K columns of X contain the leading +! K left singular vectors of the above described content +! of X. To lift them to the space of the left singular +! vectors U(:,1:K)of the input data, pre-multiply with the +! Q factor from the initial QR factorization. +! See the descriptions of F, K, V and Z. +!..... +! LDX (input) INTEGER, LDX >= N +! The leading dimension of the array X +!..... +! Y (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array +! Y is used as workspace to hold representations of the +! trailing N-1 snapshots in the orthonormal basis computed +! in the QR factorization of F. +! On exit, +! If JOBT == 'R', Y contains the MIN(M,N)-by-N upper +! triangular factor from the QR factorization of the data +! snapshot matrix F. +!..... +! LDY (input) INTEGER , LDY >= N +! The leading dimension of the array Y +!..... +! NRNK (input) INTEGER +! Determines the mode how to compute the numerical rank, +! i.e. how to truncate small singular values of the input +! matrix X. On input, if +! NRNK = -1 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(1) +! This option is recommended. +! NRNK = -2 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(i-1) +! This option is included for R&D purposes. +! It requires highly accurate SVD, which +! may not be feasible. +! The numerical rank can be enforced by using positive +! value of NRNK as follows: +! 0 < NRNK <= N-1 :: at most NRNK largest singular values +! will be used. If the number of the computed nonzero +! singular values is less than NRNK, then only those +! nonzero values will be used and the actually used +! dimension is less than NRNK. The actual number of +! the nonzero singular values is returned in the variable +! K. See the description of K. +!..... +! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +! The tolerance for truncating small singular values. +! See the description of NRNK. +!..... +! K (output) INTEGER, 0 <= K <= N +! The dimension of the SVD/POD basis for the leading N-1 +! data snapshots (columns of F) and the number of the +! computed Ritz pairs. The value of K is determined +! according to the rule set by the parameters NRNK and +! TOL. See the descriptions of NRNK and TOL. +!..... +! REIG (output) REAL(KIND=WP) (N-1)-by-1 array +! The leading K (K<=N) entries of REIG contain +! the real parts of the computed eigenvalues +! REIG(1:K) + sqrt(-1)*IMEIG(1:K). +! See the descriptions of K, IMEIG, Z. +!..... +! IMEIG (output) REAL(KIND=WP) (N-1)-by-1 array +! The leading K (K0, then the corresponding complex +! conjugate pair of eigenvalues reads +! LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) +! LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) +! That is, complex conjugate pairs have consecutive +! indices (i,i+1), with the positive imaginary part +! listed first. +! See the descriptions of K, REIG, Z. +!..... +! Z (workspace/output) REAL(KIND=WP) M-by-(N-1) array +! If JOBZ =='V' then +! Z contains real Ritz vectors as follows: +! If IMEIG(i)=0, then Z(:,i) is an eigenvector of +! the i-th Ritz value. +! If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then +! [Z(:,i) Z(:,i+1)] span an invariant subspace and +! the Ritz values extracted from this subspace are +! REIG(i) + sqrt(-1)*IMEIG(i) and +! REIG(i) - sqrt(-1)*IMEIG(i). +! The corresponding eigenvectors are +! Z(:,i) + sqrt(-1)*Z(:,i+1) and +! Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. +! If JOBZ == 'F', then the above descriptions hold for +! the columns of Z*V, where the columns of V are the +! eigenvectors of the K-by-K Rayleigh quotient, and Z is +! orthonormal. The columns of V are similarly structured: +! If IMEIG(i) == 0 then Z*V(:,i) is an eigenvector, and if +! IMEIG(i) > 0 then Z*V(:,i)+sqrt(-1)*Z*V(:,i+1) and +! Z*V(:,i)-sqrt(-1)*Z*V(:,i+1) +! are the eigenvectors of LAMBDA(i), LAMBDA(i+1). +! See the descriptions of REIG, IMEIG, X and V. +!..... +! LDZ (input) INTEGER , LDZ >= M +! The leading dimension of the array Z. +!..... +! RES (output) REAL(KIND=WP) (N-1)-by-1 array +! RES(1:K) contains the residuals for the K computed +! Ritz pairs. +! If LAMBDA(i) is real, then +! RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. +! If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair +! then +! RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F +! where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] +! [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. +! It holds that +! RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 +! RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 +! where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) +! ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) +! See the description of Z. +!..... +! B (output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array. +! IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can +! be used for computing the refined vectors; see further +! details in the provided references. +! If JOBF == 'E', B(1:N,1;K) contains +! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +! Exact DMD, up to scaling by the inverse eigenvalues. +! In both cases, the content of B can be lifted to the +! original dimension of the input data by pre-multiplying +! with the Q factor from the initial QR factorization. +! Here A denotes a compression of the underlying operator. +! See the descriptions of F and X. +! If JOBF =='N', then B is not referenced. +!..... +! LDB (input) INTEGER, LDB >= MIN(M,N) +! The leading dimension of the array B. +!..... +! V (workspace/output) REAL(KIND=WP) (N-1)-by-(N-1) array +! On exit, V(1:K,1:K) contains the K eigenvectors of +! the Rayleigh quotient. The eigenvectors of a complex +! conjugate pair of eigenvalues are returned in real form +! as explained in the description of Z. The Ritz vectors +! (returned in Z) are the product of X and V; see +! the descriptions of X and Z. +!..... +! LDV (input) INTEGER, LDV >= N-1 +! The leading dimension of the array V. +!..... +! S (output) REAL(KIND=WP) (N-1)-by-(N-1) array +! The array S(1:K,1:K) is used for the matrix Rayleigh +! quotient. This content is overwritten during +! the eigenvalue decomposition by SGEEV. +! See the description of K. +!..... +! LDS (input) INTEGER, LDS >= N-1 +! The leading dimension of the array S. +!..... +! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +! On exit, +! WORK(1:MIN(M,N)) contains the scalar factors of the +! elementary reflectors as returned by SGEQRF of the +! M-by-N input matrix F. +! WORK(MIN(M,N)+1:MIN(M,N)+N-1) contains the singular values of +! the input submatrix F(1:M,1:N-1). +! If the call to SGEDMDQ is only workspace query, then +! WORK(1) contains the minimal workspace length and +! WORK(2) is the optimal workspace length. Hence, the +! length of work is at least 2. +! See the description of LWORK. +!..... +! LWORK (input) INTEGER +! The minimal length of the workspace vector WORK. +! LWORK is calculated as follows: +! Let MLWQR = N (minimal workspace for SGEQRF[M,N]) +! MLWDMD = minimal workspace for SGEDMD (see the +! description of LWORK in SGEDMD) for +! snapshots of dimensions MIN(M,N)-by-(N-1) +! MLWMQR = N (minimal workspace for +! SORMQR['L','N',M,N,N]) +! MLWGQR = N (minimal workspace for SORGQR[M,N,N]) +! Then +! LWORK = MAX(N+MLWQR, N+MLWDMD) +! is updated as follows: +! if JOBZ == 'V' or JOBZ == 'F' THEN +! LWORK = MAX( LWORK,MIN(M,N)+N-1 +MLWMQR ) +! if JOBQ == 'Q' THEN +! LWORK = MAX( LWORK,MIN(M,N)+N-1+MLWGQR) +! If on entry LWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! IWORK (workspace/output) INTEGER LIWORK-by-1 array +! Workspace that is required only if WHTSVD equals +! 2 , 3 or 4. (See the description of WHTSVD). +! If on entry LWORK =-1 or LIWORK=-1, then the +! minimal length of IWORK is computed and returned in +! IWORK(1). See the description of LIWORK. +!..... +! LIWORK (input) INTEGER +! The minimal length of the workspace vector IWORK. +! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +! Let M1=MIN(M,N), N1=N-1. Then +! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) +! If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) +! If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) +! If on entry LIWORK = -1, then a worskpace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! INFO (output) INTEGER +! -i < 0 :: On entry, the i-th argument had an +! illegal value +! = 0 :: Successful return. +! = 1 :: Void input. Quick exit (M=0 or N=0). +! = 2 :: The SVD computation of X did not converge. +! Suggestion: Check the input data and/or +! repeat with different WHTSVD. +! = 3 :: The computation of the eigenvalues did not +! converge. +! = 4 :: If data scaling was requested on input and +! the procedure found inconsistency in the data +! such that for some column index i, +! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +! to zero if JOBS=='C'. The computation proceeds +! with original or modified data and warning +! flag is set with INFO=4. +!............................................................. +!............................................................. +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP +! +! Local scalars +! ~~~~~~~~~~~~~ + INTEGER :: IMINWR, INFO1, MLWDMD, MLWGQR, & + MLWMQR, MLWORK, MLWQR, MINMN, & + OLWDMD, OLWGQR, OLWMQR, OLWORK, & + OLWQR + LOGICAL :: LQUERY, SCCOLX, SCCOLY, WANTQ, & + WNTTRF, WNTRES, WNTVEC, WNTVCF, & + WNTVCQ, WNTREF, WNTEX + CHARACTER(LEN=1) :: JOBVL +! +! Local array +! ~~~~~~~~~~~ + REAL(KIND=WP) :: RDUMMY(2) +! +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + LOGICAL LSAME + EXTERNAL LSAME +! +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL SGEMM + EXTERNAL SGEQRF, SLACPY, SLASET, SORGQR, & + SORMQR, XERBLA + +! External subroutines +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL SGEDMD + +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC MAX, MIN, INT + !.......................................................... + ! + ! Test the input arguments + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME( JOBS, 'C' ) + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTVCF = LSAME(JOBZ,'F') + WNTVCQ = LSAME(JOBZ,'Q') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + WANTQ = LSAME(JOBQ,'Q') + WNTTRF = LSAME(JOBT,'R') + MINMN = MIN(M,N) + INFO = 0 + LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. WNTVCF .OR. WNTVCQ & + .OR. LSAME(JOBZ,'N')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. LSAME(JOBZ,'N') ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WANTQ .OR. LSAME(JOBQ,'N')) ) THEN + INFO = -4 + ELSE IF ( .NOT. ( WNTTRF .OR. LSAME(JOBT,'N') ) ) THEN + INFO = -5 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -6 + ELSE IF ( .NOT. ((WHTSVD == 1).OR.(WHTSVD == 2).OR. & + (WHTSVD == 3).OR.(WHTSVD == 4)) ) THEN + INFO = -7 + ELSE IF ( M < 0 ) THEN + INFO = -8 + ELSE IF ( ( N < 0 ) .OR. ( N > M+1 ) ) THEN + INFO = -9 + ELSE IF ( LDF < M ) THEN + INFO = -11 + ELSE IF ( LDX < MINMN ) THEN + INFO = -13 + ELSE IF ( LDY < MINMN ) THEN + INFO = -15 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -16 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -17 + ELSE IF ( LDZ < M ) THEN + INFO = -22 + ELSE IF ( (WNTREF.OR.WNTEX ).AND.( LDB < MINMN ) ) THEN + INFO = -25 + ELSE IF ( LDV < N-1 ) THEN + INFO = -27 + ELSE IF ( LDS < N-1 ) THEN + INFO = -29 + END IF +! + IF ( WNTVEC .OR. WNTVCF ) THEN + JOBVL = 'V' + ELSE + JOBVL = 'N' + END IF + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( ( N == 0 ) .OR. ( N == 1 ) ) THEN + ! All output except K is void. INFO=1 signals + ! the void input. In case of a workspace query, + ! the minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + WORK(1) = 2 + WORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + MLWQR = MAX(1,N) ! Minimal workspace length for SGEQRF. + MLWORK = MIN(M,N) + MLWQR + IF ( LQUERY ) THEN + CALL SGEQRF( M, N, F, LDF, WORK, RDUMMY, -1, & + INFO1 ) + OLWQR = INT(RDUMMY(1)) + OLWORK = MIN(M,N) + OLWQR + END IF + CALL SGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,& + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + REIG, IMEIG, Z, LDZ, RES, B, LDB, & + V, LDV, S, LDS, WORK, -1, IWORK, & + LIWORK, INFO1 ) + MLWDMD = INT(WORK(1)) + MLWORK = MAX(MLWORK, MINMN + MLWDMD) + IMINWR = IWORK(1) + IF ( LQUERY ) THEN + OLWDMD = INT(WORK(2)) + OLWORK = MAX(OLWORK, MINMN+OLWDMD) + END IF + IF ( WNTVEC .OR. WNTVCF ) THEN + MLWMQR = MAX(1,N) + MLWORK = MAX(MLWORK,MINMN+N-1+MLWMQR) + IF ( LQUERY ) THEN + CALL SORMQR( 'L','N', M, N, MINMN, F, LDF, & + WORK, Z, LDZ, WORK, -1, INFO1 ) + OLWMQR = INT(WORK(1)) + OLWORK = MAX(OLWORK,MINMN+N-1+OLWMQR) + END IF + END IF + IF ( WANTQ ) THEN + MLWGQR = N + MLWORK = MAX(MLWORK,MINMN+N-1+MLWGQR) + IF ( LQUERY ) THEN + CALL SORGQR( M, MINMN, MINMN, F, LDF, WORK, & + WORK, -1, INFO1 ) + OLWGQR = INT(WORK(1)) + OLWORK = MAX(OLWORK,MINMN+N-1+OLWGQR) + END IF + END IF + IMINWR = MAX( 1, IMINWR ) + MLWORK = MAX( 2, MLWORK ) + IF ( LWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -31 + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -33 + END IF + IF( INFO /= 0 ) THEN + CALL XERBLA( 'SGEDMDQ', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + WORK(1) = MLWORK + WORK(2) = OLWORK + RETURN + END IF +!..... +! Initial QR factorization that is used to represent the +! snapshots as elements of lower dimensional subspace. +! For large scale computation with M >>N , at this place +! one can use an out of core QRF. +! + CALL SGEQRF( M, N, F, LDF, WORK, & + WORK(MINMN+1), LWORK-MINMN, INFO1 ) +! +! Define X and Y as the snapshots representations in the +! orthogonal basis computed in the QR factorization. +! X corresponds to the leading N-1 and Y to the trailing +! N-1 snapshots. + CALL SLASET( 'L', MINMN, N-1, ZERO, ZERO, X, LDX ) + CALL SLACPY( 'U', MINMN, N-1, F, LDF, X, LDX ) + CALL SLACPY( 'A', MINMN, N-1, F(1,2), LDF, Y, LDY ) + IF ( M >= 3 ) THEN + CALL SLASET( 'L', MINMN-2, N-2, ZERO, ZERO, & + Y(3,1), LDY ) + END IF +! +! Compute the DMD of the projected snapshot pairs (X,Y) + CALL SGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN, & + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + REIG, IMEIG, Z, LDZ, RES, B, LDB, V, & + LDV, S, LDS, WORK(MINMN+1), LWORK-MINMN, IWORK, & + LIWORK, INFO1 ) + IF ( INFO1 == 2 .OR. INFO1 == 3 ) THEN + ! Return with error code. + INFO = INFO1 + RETURN + ELSE + INFO = INFO1 + END IF +! +! The Ritz vectors (Koopman modes) can be explicitly +! formed or returned in factored form. + IF ( WNTVEC ) THEN + ! Compute the eigenvectors explicitly. + IF ( M > MINMN ) CALL SLASET( 'A', M-MINMN, K, ZERO, & + ZERO, Z(MINMN+1,1), LDZ ) + CALL SORMQR( 'L','N', M, K, MINMN, F, LDF, WORK, Z, & + LDZ, WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 ) + ELSE IF ( WNTVCF ) THEN + ! Return the Ritz vectors (eigenvectors) in factored + ! form Z*V, where Z contains orthonormal matrix (the + ! product of Q from the initial QR factorization and + ! the SVD/POD_basis returned by SGEDMD in X) and the + ! second factor (the eigenvectors of the Rayleigh + ! quotient) is in the array V, as returned by SGEDMD. + CALL SLACPY( 'A', N, K, X, LDX, Z, LDZ ) + IF ( M > N ) CALL SLASET( 'A', M-N, K, ZERO, ZERO, & + Z(N+1,1), LDZ ) + CALL SORMQR( 'L','N', M, K, MINMN, F, LDF, WORK, Z, & + LDZ, WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 ) + END IF +! +! Some optional output variables: +! +! The upper triangular factor in the initial QR +! factorization is optionally returned in the array Y. +! This is useful if this call to SGEDMDQ is to be +! followed by a streaming DMD that is implemented in a +! QR compressed form. + IF ( WNTTRF ) THEN ! Return the upper triangular R in Y + CALL SLASET( 'A', MINMN, N, ZERO, ZERO, Y, LDY ) + CALL SLACPY( 'U', MINMN, N, F, LDF, Y, LDY ) + END IF +! +! The orthonormal/orthogonal factor in the initial QR +! factorization is optionally returned in the array F. +! Same as with the triangular factor above, this is +! useful in a streaming DMD. + IF ( WANTQ ) THEN ! Q overwrites F + CALL SORGQR( M, MINMN, MINMN, F, LDF, WORK, & + WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 ) + END IF +! + RETURN +! + END SUBROUTINE SGEDMDQ + \ No newline at end of file diff --git a/lapack-netlib/SRC/sgees.f b/lapack-netlib/SRC/sgees.f index 6febd549c..4418ea064 100644 --- a/lapack-netlib/SRC/sgees.f +++ b/lapack-netlib/SRC/sgees.f @@ -208,7 +208,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEeigen +*> \ingroup gees * * ===================================================================== SUBROUTINE SGEES( JOBVS, SORT, SELECT, N, A, LDA, SDIM, WR, WI, @@ -250,14 +250,14 @@ REAL DUM( 1 ) * .. * .. External Subroutines .. - EXTERNAL SCOPY, SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLABAD, - $ SLACPY, SLASCL, SORGHR, SSWAP, STRSEN, XERBLA + EXTERNAL SCOPY, SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLACPY, + $ SLASCL, SORGHR, SSWAP, STRSEN, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -312,7 +312,7 @@ MAXWRK = MAX( MAXWRK, N + HSWORK ) END IF END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -13 @@ -338,7 +338,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -524,7 +523,7 @@ 30 CONTINUE END IF * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of SGEES diff --git a/lapack-netlib/SRC/sgeesx.f b/lapack-netlib/SRC/sgeesx.f index 6810fe7c8..cabe9f1f7 100644 --- a/lapack-netlib/SRC/sgeesx.f +++ b/lapack-netlib/SRC/sgeesx.f @@ -272,7 +272,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEeigen +*> \ingroup geesx * * ===================================================================== SUBROUTINE SGEESX( JOBVS, SORT, SELECT, SENSE, N, A, LDA, SDIM, @@ -317,14 +317,14 @@ REAL DUM( 1 ) * .. * .. External Subroutines .. - EXTERNAL SCOPY, SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLABAD, + EXTERNAL SCOPY, SGEBAK, SGEBAL, SGEHRD, SHSEQR, $ SLACPY, SLASCL, SORGHR, SSWAP, STRSEN, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -398,7 +398,7 @@ $ LIWRK = ( N*N )/4 END IF IWORK( 1 ) = LIWRK - WORK( 1 ) = LWRK + WORK( 1 ) = SROUNDUP_LWORK(LWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -16 @@ -426,7 +426,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -634,7 +633,7 @@ 30 CONTINUE END IF * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) IF( WANTSV .OR. WANTSB ) THEN IWORK( 1 ) = SDIM*(N-SDIM) ELSE diff --git a/lapack-netlib/SRC/sgeev.f b/lapack-netlib/SRC/sgeev.f index ed1724721..93f993265 100644 --- a/lapack-netlib/SRC/sgeev.f +++ b/lapack-netlib/SRC/sgeev.f @@ -184,7 +184,7 @@ * * @generated from dgeev.f, fortran d -> s, Tue Apr 19 01:47:44 2016 * -*> \ingroup realGEeigen +*> \ingroup geev * * ===================================================================== SUBROUTINE SGEEV( JOBVL, JOBVR, N, A, LDA, WR, WI, VL, LDVL, VR, @@ -223,16 +223,15 @@ REAL DUM( 1 ) * .. * .. External Subroutines .. - EXTERNAL SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLABAD, SLACPY, - $ SLARTG, SLASCL, SORGHR, SROT, SSCAL, STREVC3, - $ XERBLA + EXTERNAL SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLACPY, SLARTG, + $ SLASCL, SORGHR, SROT, SSCAL, STREVC3, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX, ILAENV - REAL SLAMCH, SLANGE, SLAPY2, SNRM2 + REAL SLAMCH, SLANGE, SLAPY2, SNRM2, SROUNDUP_LWORK EXTERNAL LSAME, ISAMAX, ILAENV, SLAMCH, SLANGE, SLAPY2, - $ SNRM2 + $ SNRM2, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -312,7 +311,7 @@ END IF MAXWRK = MAX( MAXWRK, MINWRK ) END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -13 @@ -336,7 +335,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -519,7 +517,7 @@ END IF END IF * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of SGEEV diff --git a/lapack-netlib/SRC/sgeevx.f b/lapack-netlib/SRC/sgeevx.f index ed1ea1cb9..b0af78605 100644 --- a/lapack-netlib/SRC/sgeevx.f +++ b/lapack-netlib/SRC/sgeevx.f @@ -297,7 +297,7 @@ * * @generated from dgeevx.f, fortran d -> s, Tue Apr 19 01:47:44 2016 * -*> \ingroup realGEeigen +*> \ingroup geevx * * ===================================================================== SUBROUTINE SGEEVX( BALANC, JOBVL, JOBVR, SENSE, N, A, LDA, WR, WI, @@ -341,16 +341,16 @@ REAL DUM( 1 ) * .. * .. External Subroutines .. - EXTERNAL SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLABAD, SLACPY, + EXTERNAL SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLACPY, $ SLARTG, SLASCL, SORGHR, SROT, SSCAL, STREVC3, $ STRSNA, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX, ILAENV - REAL SLAMCH, SLANGE, SLAPY2, SNRM2 + REAL SLAMCH, SLANGE, SLAPY2, SNRM2, SROUNDUP_LWORK EXTERNAL LSAME, ISAMAX, ILAENV, SLAMCH, SLANGE, SLAPY2, - $ SNRM2 + $ SNRM2, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -453,7 +453,7 @@ END IF MAXWRK = MAX( MAXWRK, MINWRK ) END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -21 @@ -477,7 +477,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -684,7 +683,7 @@ END IF END IF * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of SGEEVX diff --git a/lapack-netlib/SRC/sgehrd.f b/lapack-netlib/SRC/sgehrd.f index 41b9aa78e..cfa17e156 100644 --- a/lapack-netlib/SRC/sgehrd.f +++ b/lapack-netlib/SRC/sgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup gehrd * *> \par Further Details: * ===================== @@ -173,7 +173,7 @@ INTEGER IHI, ILO, INFO, LDA, LWORK, N * .. * .. Array Arguments .. - REAL A( LDA, * ), TAU( * ), WORK( * ) + REAL A( LDA, * ), TAU( * ), WORK( * ) * .. * * ===================================================================== @@ -182,7 +182,7 @@ INTEGER NBMAX, LDT, TSIZE PARAMETER ( NBMAX = 64, LDT = NBMAX+1, $ TSIZE = LDT*NBMAX ) - REAL ZERO, ONE + REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, $ ONE = 1.0E+0 ) * .. @@ -190,7 +190,7 @@ LOGICAL LQUERY INTEGER I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB, $ NBMIN, NH, NX - REAL EI + REAL EI * .. * .. External Subroutines .. EXTERNAL SAXPY, SGEHD2, SGEMM, SLAHR2, SLARFB, STRMM, @@ -201,7 +201,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -221,13 +222,19 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'SGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE - WORK( 1 ) = LWKOPT + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'SGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + ENDIF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -248,7 +255,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -268,7 +274,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of @@ -344,7 +350,8 @@ * Use unblocked code to reduce the rest of the matrix * CALL SGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO ) - WORK( 1 ) = LWKOPT +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sgejsv.f b/lapack-netlib/SRC/sgejsv.f index 923573bdb..42d2b8313 100644 --- a/lapack-netlib/SRC/sgejsv.f +++ b/lapack-netlib/SRC/sgejsv.f @@ -253,7 +253,7 @@ *> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of *> the right singular vectors; *> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), -*> then V is used as workspace if the pprocedure +*> then V is used as workspace if the procedure *> replaces A with A^t. In that case, [U] is computed *> in V as right singular vectors of A^t and then *> copied back to the U array. This 'W' option is just @@ -1386,7 +1386,7 @@ IF ( CONDR2 .GE. COND_OK ) THEN * .. save the Householder vectors used for Q3 * (this overwrites the copy of R2, as it will not be -* needed in this branch, but it does not overwritte the +* needed in this branch, but it does not overwrite the * Huseholder vectors of Q2.). CALL SLACPY( 'U', NR, NR, V, LDV, WORK(2*N+1), N ) * .. and the rest of the information on Q3 is in @@ -1409,7 +1409,7 @@ END IF * * Second preconditioning finished; continue with Jacobi SVD -* The input matrix is lower trinagular. +* The input matrix is lower triangular. * * Recover the right singular vectors as solution of a well * conditioned triangular matrix equation. @@ -1454,7 +1454,7 @@ * :) .. the input matrix A is very likely a relative of * the Kahan matrix :) * The matrix R2 is inverted. The solution of the matrix equation -* is Q3^T*V3 = the product of the Jacobi rotations (appplied to +* is Q3^T*V3 = the product of the Jacobi rotations (applied to * the lower triangular L3 from the LQ factorization of * R2=L3*Q3), pre-multiplied with the transposed Q3. CALL SGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U, diff --git a/lapack-netlib/SRC/sgelq.f b/lapack-netlib/SRC/sgelq.f index 9209f918e..75f02675d 100644 --- a/lapack-netlib/SRC/sgelq.f +++ b/lapack-netlib/SRC/sgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -166,6 +166,8 @@ *> the LQ factorization. *> \endverbatim *> +*> \ingroup gelq +*> * ===================================================================== SUBROUTINE SGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -190,7 +192,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGELQT, SLASWLQ, XERBLA @@ -292,9 +295,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = LWREQ + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -319,7 +322,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = LWREQ + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) RETURN * * End of SGELQ diff --git a/lapack-netlib/SRC/sgelqf.f b/lapack-netlib/SRC/sgelqf.f index 24d8ab19c..3b3913d84 100644 --- a/lapack-netlib/SRC/sgelqf.f +++ b/lapack-netlib/SRC/sgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -118,7 +119,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup gelqf * *> \par Further Details: * ===================== @@ -167,16 +168,16 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'SGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -184,19 +185,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -266,7 +273,7 @@ $ CALL SGELQ2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of SGELQF diff --git a/lapack-netlib/SRC/sgelqt3.f b/lapack-netlib/SRC/sgelqt3.f index 23816b4c8..82f5c1cf4 100644 --- a/lapack-netlib/SRC/sgelqt3.f +++ b/lapack-netlib/SRC/sgelqt3.f @@ -158,7 +158,8 @@ * * Compute Householder transform when M=1 * - CALL SLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T ) + CALL SLARFG( N, A( 1, 1 ), A( 1, MIN( 2, N ) ), LDA, + & T( 1, 1 ) ) * ELSE * diff --git a/lapack-netlib/SRC/sgels.f b/lapack-netlib/SRC/sgels.f index ea02c3318..b58f70c9e 100644 --- a/lapack-netlib/SRC/sgels.f +++ b/lapack-netlib/SRC/sgels.f @@ -175,7 +175,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEsolve +*> \ingroup gels * * ===================================================================== SUBROUTINE SGELS( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, @@ -210,15 +210,15 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. External Subroutines .. - EXTERNAL SGELQF, SGEQRF, SLABAD, SLASCL, SLASET, SORMLQ, + EXTERNAL SGELQF, SGEQRF, SLASCL, SLASET, SORMLQ, $ SORMQR, STRTRS, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN, REAL + INTRINSIC MAX, MIN * .. * .. Executable Statements .. * @@ -273,7 +273,7 @@ END IF * WSIZE = MAX( 1, MN + MAX( MN, NRHS )*NB ) - WORK( 1 ) = REAL( WSIZE ) + WORK( 1 ) = SROUNDUP_LWORK( WSIZE ) * END IF * @@ -295,7 +295,6 @@ * SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * @@ -492,7 +491,7 @@ END IF * 50 CONTINUE - WORK( 1 ) = REAL( WSIZE ) + WORK( 1 ) = SROUNDUP_LWORK( WSIZE ) * RETURN * diff --git a/lapack-netlib/SRC/sgelsd.f b/lapack-netlib/SRC/sgelsd.f index f5f17d34c..2818213f4 100644 --- a/lapack-netlib/SRC/sgelsd.f +++ b/lapack-netlib/SRC/sgelsd.f @@ -59,12 +59,6 @@ *> singular values which are less than RCOND times the largest singular *> value. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -195,7 +189,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEsolve +*> \ingroup gelsd * *> \par Contributors: * ================== @@ -235,13 +229,13 @@ REAL ANRM, BIGNUM, BNRM, EPS, SFMIN, SMLNUM * .. * .. External Subroutines .. - EXTERNAL SGEBRD, SGELQF, SGEQRF, SLABAD, SLACPY, SLALSD, - $ SLASCL, SLASET, SORMBR, SORMLQ, SORMQR, XERBLA + EXTERNAL SGEBRD, SGELQF, SGEQRF, SLACPY, SLALSD, SLASCL, + $ SLASET, SORMBR, SORMLQ, SORMQR, XERBLA * .. * .. External Functions .. INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL SLAMCH, SLANGE, ILAENV + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL SLAMCH, SLANGE, ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, LOG, MAX, MIN, REAL @@ -354,7 +348,7 @@ END IF END IF MINWRK = MIN( MINWRK, MAXWRK ) - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) IWORK( 1 ) = LIWORK * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN @@ -382,7 +376,6 @@ SFMIN = SLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A if max entry outside range [SMLNUM,BIGNUM]. * @@ -621,7 +614,7 @@ END IF * 10 CONTINUE - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) IWORK( 1 ) = LIWORK RETURN * diff --git a/lapack-netlib/SRC/sgelss.f b/lapack-netlib/SRC/sgelss.f index 9aed4329f..2e4b0cdd5 100644 --- a/lapack-netlib/SRC/sgelss.f +++ b/lapack-netlib/SRC/sgelss.f @@ -164,7 +164,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEsolve +*> \ingroup gelss * * ===================================================================== SUBROUTINE SGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, @@ -202,13 +202,13 @@ * .. * .. External Subroutines .. EXTERNAL SBDSQR, SCOPY, SGEBRD, SGELQF, SGEMM, SGEMV, - $ SGEQRF, SLABAD, SLACPY, SLASCL, SLASET, SORGBR, + $ SGEQRF, SLACPY, SLASCL, SLASET, SORGBR, $ SORMBR, SORMLQ, SORMQR, SRSCL, XERBLA * .. * .. External Functions .. INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -355,7 +355,7 @@ END IF MAXWRK = MAX( MINWRK, MAXWRK ) END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) $ INFO = -12 @@ -381,7 +381,6 @@ SFMIN = SLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A if max element outside range [SMLNUM,BIGNUM] * @@ -525,7 +524,7 @@ $ LDB, ZERO, WORK, N ) CALL SLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB ) 20 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL SGEMV( 'T', N, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 ) CALL SCOPY( N, WORK, 1, B, 1 ) END IF @@ -622,7 +621,7 @@ CALL SLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ), $ LDB ) 40 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL SGEMV( 'T', M, M, ONE, WORK( IL ), LDWORK, B( 1, 1 ), $ 1, ZERO, WORK( IWORK ), 1 ) CALL SCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 ) @@ -708,7 +707,7 @@ $ LDB, ZERO, WORK, N ) CALL SLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB ) 60 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL SGEMV( 'T', M, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 ) CALL SCOPY( N, WORK, 1, B, 1 ) END IF @@ -732,7 +731,7 @@ END IF * 70 CONTINUE - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of SGELSS diff --git a/lapack-netlib/SRC/sgelst.f b/lapack-netlib/SRC/sgelst.f index 5377bc720..b89918656 100644 --- a/lapack-netlib/SRC/sgelst.f +++ b/lapack-netlib/SRC/sgelst.f @@ -176,7 +176,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEsolve +*> \ingroup gelst * *> \par Contributors: * ================== @@ -222,15 +222,15 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. External Subroutines .. - EXTERNAL SGELQT, SGEQRT, SGEMLQT, SGEMQRT, SLABAD, + EXTERNAL SGELQT, SGEQRT, SGEMLQT, SGEMQRT, $ SLASCL, SLASET, STRTRS, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC REAL, MAX, MIN + INTRINSIC MAX, MIN * .. * .. Executable Statements .. * @@ -268,7 +268,7 @@ * MNNRHS = MAX( MN, NRHS ) LWOPT = MAX( 1, (MN+MNNRHS)*NB ) - WORK( 1 ) = REAL( LWOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWOPT ) * END IF * @@ -283,7 +283,7 @@ * IF( MIN( M, N, NRHS ).EQ.0 ) THEN CALL SLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) - WORK( 1 ) = REAL( LWOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWOPT ) RETURN END IF * @@ -309,7 +309,6 @@ * SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * @@ -332,7 +331,7 @@ * Matrix all zero. Return zero solution. * CALL SLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) - WORK( 1 ) = REAL( LWOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWOPT ) RETURN END IF * @@ -522,7 +521,7 @@ $ INFO ) END IF * - WORK( 1 ) = REAL( LWOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sgelsy.f b/lapack-netlib/SRC/sgelsy.f index 9c60f78a7..c7f5069de 100644 --- a/lapack-netlib/SRC/sgelsy.f +++ b/lapack-netlib/SRC/sgelsy.f @@ -115,6 +115,7 @@ *> B is REAL array, dimension (LDB,NRHS) *> On entry, the M-by-NRHS right hand side matrix B. *> On exit, the N-by-NRHS solution matrix X. +*> If M = 0 or N = 0, B is not referenced. *> \endverbatim *> *> \param[in] LDB @@ -147,6 +148,7 @@ *> The effective rank of A, i.e., the order of the submatrix *> R11. This is the same as the order of the submatrix T11 *> in the complete orthogonal factorization of A. +*> If NRHS = 0, RANK = 0 on output. *> \endverbatim *> *> \param[out] WORK @@ -189,7 +191,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEsolve +*> \ingroup gelsy * *> \par Contributors: * ================== @@ -232,11 +234,11 @@ * .. * .. External Functions .. INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. External Subroutines .. - EXTERNAL SCOPY, SGEQP3, SLABAD, SLAIC1, SLASCL, SLASET, + EXTERNAL SCOPY, SGEQP3, SLAIC1, SLASCL, SLASET, $ SORMQR, SORMRZ, STRSM, STZRZF, XERBLA * .. * .. Intrinsic Functions .. @@ -280,7 +282,7 @@ LWKOPT = MAX( LWKMIN, $ MN + 2*N + NB*( N + 1 ), 2*MN + NB*NRHS ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -12 @@ -305,7 +307,6 @@ * SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max entries outside range [SMLNUM,BIGNUM] * @@ -467,7 +468,7 @@ END IF * 70 CONTINUE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/sgemlq.f b/lapack-netlib/SRC/sgemlq.f index 536abf184..7e4d9bf65 100644 --- a/lapack-netlib/SRC/sgemlq.f +++ b/lapack-netlib/SRC/sgemlq.f @@ -110,16 +110,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -143,7 +144,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -159,11 +160,13 @@ *> block sizes MB and NB returned by ILAENV, SGELQ will use either *> SLASWLQ (if the matrix is wide-and-short) or SGELQT to compute *> the LQ factorization. -*> This version of SGEMLQ will use either SLAMSWLQ or SGEMLQT to +*> This version of SGEMLQ will use either SLAMSWLQ or SGEMLQT to *> multiply matrix Q by another matrix. *> Further Details in SLAMSWLQ or SGEMLQT. *> \endverbatim *> +*> \ingroup gemlq +*> * ===================================================================== SUBROUTINE SGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -185,12 +188,16 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. +* .. External Functions .. + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL SLAMSWLQ, SGEMLQT, XERBLA * .. @@ -201,7 +208,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -216,6 +223,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -244,12 +258,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = REAL( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -261,7 +275,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -274,7 +288,7 @@ $ MB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = REAL( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/sgemqr.f b/lapack-netlib/SRC/sgemqr.f index 2a9257459..19bf467b8 100644 --- a/lapack-netlib/SRC/sgemqr.f +++ b/lapack-netlib/SRC/sgemqr.f @@ -120,7 +120,7 @@ *> The dimension of the array WORK. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +144,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -160,12 +160,14 @@ *> block sizes MB and NB returned by ILAENV, SGEQR will use either *> SLATSQR (if the matrix is tall-and-skinny) or SGEQRT to compute *> the QR factorization. -*> This version of SGEMQR will use either SLAMTSQR or SGEMQRT to +*> This version of SGEMQR will use either SLAMTSQR or SGEMQRT to *> multiply matrix Q by another matrix. *> Further Details in SLAMTSQR or SGEMQRT. *> *> \endverbatim *> +*> \ingroup gemqr +*> * ===================================================================== SUBROUTINE SGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -187,11 +189,13 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMQRT, SLAMTSQR, XERBLA @@ -203,7 +207,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -218,6 +222,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -246,12 +257,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -263,7 +274,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -276,7 +287,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/sgeqlf.f b/lapack-netlib/SRC/sgeqlf.f index efecfbb3c..14942b765 100644 --- a/lapack-netlib/SRC/sgeqlf.f +++ b/lapack-netlib/SRC/sgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is the *> optimal blocksize. *> @@ -113,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup geqlf * *> \par Further Details: * ===================== @@ -162,7 +163,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -186,10 +188,11 @@ NB = ILAENV( 1, 'SGEQLF', ' ', M, N, -1, -1 ) LWKOPT = N*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * @@ -276,7 +279,7 @@ IF( MU.GT.0 .AND. NU.GT.0 ) $ CALL SGEQL2( MU, NU, A, LDA, TAU, WORK, IINFO ) * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of SGEQLF diff --git a/lapack-netlib/SRC/sgeqp3.f b/lapack-netlib/SRC/sgeqp3.f index 493bdae6a..9f2f40b2e 100644 --- a/lapack-netlib/SRC/sgeqp3.f +++ b/lapack-netlib/SRC/sgeqp3.f @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup geqp3 * *> \par Further Details: * ===================== @@ -177,8 +177,8 @@ * .. * .. External Functions .. INTEGER ILAENV - REAL SNRM2 - EXTERNAL ILAENV, SNRM2 + REAL SNRM2, SROUNDUP_LWORK + EXTERNAL ILAENV, SNRM2, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN @@ -205,7 +205,7 @@ NB = ILAENV( INB, 'SGEQRF', ' ', M, N, -1, -1 ) LWKOPT = 2*N + ( N + 1 )*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN INFO = -8 @@ -347,7 +347,7 @@ * END IF * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of SGEQP3 diff --git a/lapack-netlib/SRC/sgeqp3rk.c b/lapack-netlib/SRC/sgeqp3rk.c new file mode 100644 index 000000000..fe52901bf --- /dev/null +++ b/lapack-netlib/SRC/sgeqp3rk.c @@ -0,0 +1,1055 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + +/* Check if the matrix A contains +Inf or -Inf, set INFO parameter */ +/* to the column number, where the first +/-Inf is found plus N, */ +/* and continue the computation. */ + + *info = *n + kp1; + + } + +/* ================================================================== */ + +/* Quick return if possible for the case when the first */ +/* stopping criterion is satisfied, i.e. KMAX = 0. */ + + if (*kmax == 0) { + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.f; + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + tau[j] = 0.f; + } + work[1] = (real) lwkopt; + return 0; + } + +/* ================================================================== */ + + eps = slamch_("Epsilon"); + +/* Adjust ABSTOL */ + + if (*abstol >= 0.f) { + safmin = slamch_("Safe minimum"); +/* Computing MAX */ + r__1 = *abstol, r__2 = safmin * 2.f; + *abstol = f2cmax(r__1,r__2); + } + +/* Adjust RELTOL */ + + if (*reltol >= 0.f) { + *reltol = f2cmax(*reltol,eps); + } + +/* =================================================================== */ + +/* JMAX is the maximum index of the column to be factorized, */ +/* which is also limited by the first stopping criterion KMAX. */ + + jmax = f2cmin(*kmax,minmn); + +/* =================================================================== */ + +/* Quick return if possible for the case when the second or third */ +/* stopping criterion for the whole original matrix is satified, */ +/* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */ +/* (which is ONE <= RELTOL). */ + + if (maxc2nrm <= *abstol || 1.f <= *reltol) { + + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.f; + + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + tau[j] = 0.f; + } + + work[1] = (real) lwkopt; + return 0; + } + +/* ================================================================== */ +/* Factorize columns */ +/* ================================================================== */ + +/* Determine the block size. */ + + nbmin = 2; + nx = 0; + + if (nb > 1 && nb < minmn) { + +/* Determine when to cross over from blocked to unblocked code. */ +/* (for N less than NX, unblocked code should be used). */ + +/* Computing MAX */ + i__1 = 0, i__2 = ilaenv_(&c__3, "SGEQP3RK", " ", m, n, &c_n1, &c_n1, ( + ftnlen)8, (ftnlen)1); + nx = f2cmax(i__1,i__2); + + if (nx < minmn) { + +/* Determine if workspace is large enough for blocked code. */ + + if (*lwork < lwkopt) { + +/* Not enough workspace to use optimal block size that */ +/* is currently stored in NB. */ +/* Reduce NB and determine the minimum value of NB. */ + + nb = (*lwork - (*n << 1)) / (*n + 1); +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "SGEQP3RK", " ", m, n, &c_n1, + &c_n1, (ftnlen)8, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + } + } + } + +/* ================================================================== */ + +/* DONE is the boolean flag to rerpresent the case when the */ +/* factorization completed in the block factorization routine, */ +/* before the end of the block. */ + + done = FALSE_; + +/* J is the column index. */ + + j = 1; + +/* (1) Use blocked code initially. */ + +/* JMAXB is the maximum column index of the block, when the */ +/* blocked code is used, is also limited by the first stopping */ +/* criterion KMAX. */ + +/* Computing MIN */ + i__1 = *kmax, i__2 = minmn - nx; + jmaxb = f2cmin(i__1,i__2); + + if (nb >= nbmin && nb < jmax && jmaxb > 0) { + +/* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */ +/* J is the column index of a column block; */ +/* JB is the column block size to pass to block factorization */ +/* routine in a loop step; */ +/* JBF is the number of columns that were actually factorized */ +/* that was returned by the block factorization routine */ +/* in a loop step, JBF <= JB; */ +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + while(j <= jmaxb) { + +/* Computing MIN */ + i__1 = nb, i__2 = jmaxb - j + 1; + jb = f2cmin(i__1,i__2); + n_sub__ = *n - j + 1; + ioffset = j - 1; + +/* Factorize JB columns among the columns A(J:N). */ + + i__1 = *n + *nrhs - j + 1; + slaqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, + &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, + maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], & + work[*n + j], &work[(*n << 1) + 1], &work[(*n << 1) + jb + + 1], &i__1, &iwork[1], &iinfo); + +/* Set INFO on the first occurence of Inf. */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } + + if (done) { + +/* Either the submatrix is zero before the end of the */ +/* column block, or ABSTOL or RELTOL criterion is */ +/* satisfied before the end of the column block, we can */ +/* return from the routine. Perform the following before */ +/* returning: */ +/* a) Set the number of factorized columns K, */ +/* K = IOFFSET + JBF from the last call of blocked */ +/* routine. */ +/* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */ +/* by the block factorization routine; */ +/* 2) The remaining TAUs are set to ZERO by the */ +/* block factorization routine. */ + + *k = ioffset + jbf; + +/* Set INFO on the first occurrence of NaN, NaN takes */ +/* prcedence over Inf. */ + + if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + +/* Return from the routine. */ + + work[1] = (real) lwkopt; + + return 0; + + } + + j += jbf; + + } + + } + +/* Use unblocked code to factor the last or only block. */ +/* J = JMAX+1 means we factorized the maximum possible number of */ +/* columns, that is in ELSE clause we need to compute */ +/* the MAXC2NORM and RELMAXC2NORM to return after we processed */ +/* the blocks. */ + + if (j <= jmax) { + +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + n_sub__ = *n - j + 1; + ioffset = j - 1; + + i__1 = jmax - j + 1; + slaqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, & + maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, + relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], &work[*n + j], & + work[(*n << 1) + 1], &iinfo); + +/* ABSTOL or RELTOL criterion is satisfied when the number of */ +/* the factorized columns KF is smaller then the number */ +/* of columns JMAX-J+1 supplied to be factorized by the */ +/* unblocked routine, we can return from */ +/* the routine. Perform the following before returning: */ +/* a) Set the number of factorized columns K, */ +/* b) MAXC2NRMK and RELMAXC2NRMK are returned by the */ +/* unblocked factorization routine above. */ + + *k = j - 1 + kf; + +/* Set INFO on the first exception occurence. */ + +/* Set INFO on the first exception occurence of Inf or NaN, */ +/* (NaN takes precedence over Inf). */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } else if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + + } else { + +/* Compute the return values for blocked code. */ + +/* Set the number of factorized columns if the unblocked routine */ +/* was not called. */ + + *k = jmax; + +/* If there exits a residual matrix after the blocked code: */ +/* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */ +/* residual matrix, otherwise set them to ZERO; */ +/* 2) Set TAU(K+1:MINMN) to ZERO. */ + + if (*k < minmn) { + i__1 = *n - *k; + jmaxc2nrm = *k + isamax_(&i__1, &work[*k + 1], &c__1); + *maxc2nrmk = work[jmaxc2nrm]; + if (*k == 0) { + *relmaxc2nrmk = 1.f; + } else { + *relmaxc2nrmk = *maxc2nrmk / maxc2nrm; + } + + i__1 = minmn; + for (j = *k + 1; j <= i__1; ++j) { + tau[j] = 0.f; + } + + } + +/* END IF( J.LE.JMAX ) THEN */ + + } + + work[1] = (real) lwkopt; + + return 0; + +/* End of SGEQP3RK */ + +} /* sgeqp3rk_ */ + diff --git a/lapack-netlib/SRC/sgeqp3rk.f b/lapack-netlib/SRC/sgeqp3rk.f new file mode 100644 index 000000000..d3a335b88 --- /dev/null +++ b/lapack-netlib/SRC/sgeqp3rk.f @@ -0,0 +1,1083 @@ +*> \brief \b SGEQP3RK computes a truncated Householder QR factorization with column pivoting of a real m-by-n matrix A by using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SGEQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, +* $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ WORK, LWORK, IWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, K, KMAX, LDA, LWORK, M, N, NRHS +* REAL ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* REAL A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SGEQP3RK performs two tasks simultaneously: +*> +*> Task 1: The routine computes a truncated (rank K) or full rank +*> Householder QR factorization with column pivoting of a real +*> M-by-N matrix A using Level 3 BLAS. K is the number of columns +*> that were factorized, i.e. factorization rank of the +*> factor R, K <= min(M,N). +*> +*> A * P(K) = Q(K) * R(K) = +*> +*> = Q(K) * ( R11(K) R12(K) ) = Q(K) * ( R(K)_approx ) +*> ( 0 R22(K) ) ( 0 R(K)_residual ), +*> +*> where: +*> +*> P(K) is an N-by-N permutation matrix; +*> Q(K) is an M-by-M orthogonal matrix; +*> R(K)_approx = ( R11(K), R12(K) ) is a rank K approximation of the +*> full rank factor R with K-by-K upper-triangular +*> R11(K) and K-by-N rectangular R12(K). The diagonal +*> entries of R11(K) appear in non-increasing order +*> of absolute value, and absolute values of all of +*> them exceed the maximum column 2-norm of R22(K) +*> up to roundoff error. +*> R(K)_residual = R22(K) is the residual of a rank K approximation +*> of the full rank factor R. It is a +*> an (M-K)-by-(N-K) rectangular matrix; +*> 0 is a an (M-K)-by-K zero matrix. +*> +*> Task 2: At the same time, the routine overwrites a real M-by-NRHS +*> matrix B with Q(K)**T * B using Level 3 BLAS. +*> +*> ===================================================================== +*> +*> The matrices A and B are stored on input in the array A as +*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS) +*> respectively. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> The truncation criteria (i.e. when to stop the factorization) +*> can be any of the following: +*> +*> 1) The input parameter KMAX, the maximum number of columns +*> KMAX to factorize, i.e. the factorization rank is limited +*> to KMAX. If KMAX >= min(M,N), the criterion is not used. +*> +*> 2) The input parameter ABSTOL, the absolute tolerance for +*> the maximum column 2-norm of the residual matrix R22(K). This +*> means that the factorization stops if this norm is less or +*> equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used. +*> +*> 3) The input parameter RELTOL, the tolerance for the maximum +*> column 2-norm matrix of the residual matrix R22(K) divided +*> by the maximum column 2-norm of the original matrix A, which +*> is equal to abs(R(1,1)). This means that the factorization stops +*> when the ratio of the maximum column 2-norm of R22(K) to +*> the maximum column 2-norm of A is less than or equal to RELTOL. +*> If RELTOL < 0.0, the criterion is not used. +*> +*> 4) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix R22(K) is a zero matrix in some +*> factorization step K. ( This stopping criterion is implicit. ) +*> +*> The algorithm stops when any of these conditions is first +*> satisfied, otherwise the whole matrix A is factorized. +*> +*> To factorize the whole matrix A, use the values +*> KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0. +*> +*> The routine returns: +*> a) Q(K), R(K)_approx = ( R11(K), R12(K) ), +*> R(K)_residual = R22(K), P(K), i.e. the resulting matrices +*> of the factorization; P(K) is represented by JPIV, +*> ( if K = min(M,N), R(K)_approx is the full factor R, +*> and there is no residual matrix R(K)_residual); +*> b) K, the number of columns that were factorized, +*> i.e. factorization rank; +*> c) MAXC2NRMK, the maximum column 2-norm of the residual +*> matrix R(K)_residual = R22(K), +*> ( if K = min(M,N), MAXC2NRMK = 0.0 ); +*> d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum +*> column 2-norm of the original matrix A, which is equal +*> to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 ); +*> e) Q(K)**T * B, the matrix B with the orthogonal +*> transformation Q(K)**T applied on the left. +*> +*> The N-by-N permutation matrix P(K) is stored in a compact form in +*> the integer array JPIV. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The M-by-M orthogonal matrix Q is represented as a product +*> of elementary Householder reflectors +*> +*> Q(K) = H(1) * H(2) * . . . * H(K), +*> +*> where K is the number of columns that were factorized. +*> +*> Each H(j) has the form +*> +*> H(j) = I - tau * v * v**T, +*> +*> where 1 <= j <= K and +*> I is an M-by-M identity matrix, +*> tau is a real scalar, +*> v is a real vector with v(1:j-1) = 0 and v(j) = 1. +*> +*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j). +*> +*> See the Further Details section for more information. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e. the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M,N), then this stopping criterion +*> is not used, the routine factorizes columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B are not modified, and +*> the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL +*> +*> The second factorization stopping criterion, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix R22(K). +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix R22(K) +*> is less than or equal to ABSTOL. Let SAFMIN = SLAMCH('S'). +*> +*> a) If ABSTOL is NaN, then no computation is performed +*> and an error message ( INFO = -5 ) is issued +*> by XERBLA. +*> +*> b) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN +*> is used. This includes the case ABSTOL = -0.0. +*> +*> d) If 2*SAFMIN <= ABSTOL then the input value +*> of ABSTOL is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If ABSTOL chosen above is >= MAXC2NRM, then this +*> stopping criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. The routine +*> returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case ABSTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is REAL +*> +*> The third factorization stopping criterion, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio +*> abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of +*> the residual matrix R22(K) to the maximum column 2-norm of +*> the original matrix A. The algorithm converges (stops the +*> factorization), when abs(R(K+1,K+1))/abs(R(1,1)) A is less +*> than or equal to RELTOL. Let EPS = SLAMCH('E'). +*> +*> a) If RELTOL is NaN, then no computation is performed +*> and an error message ( INFO = -6 ) is issued +*> by XERBLA. +*> +*> b) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used. +*> This includes the case RELTOL = -0.0. +*> +*> d) If EPS <= RELTOL then the input value of RELTOL +*> is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If RELTOL chosen above is >= 1.0, then this stopping +*> criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. +*> The routine returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case RELTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> +*> NOTE: We recommend that RELTOL satisfy +*> min( max(M,N)*EPS, sqrt(EPS) ) <= RELTOL +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N+NRHS) +*> +*> On entry: +*> +*> a) The subarray A(1:M,1:N) contains the M-by-N matrix A. +*> b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS +*> matrix B. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> +*> a) The subarray A(1:M,1:N) contains parts of the factors +*> of the matrix A: +*> +*> 1) If K = 0, A(1:M,1:N) contains the original matrix A. +*> 2) If K > 0, A(1:M,1:N) contains parts of the +*> factors: +*> +*> 1. The elements below the diagonal of the subarray +*> A(1:M,1:K) together with TAU(1:K) represent the +*> orthogonal matrix Q(K) as a product of K Householder +*> elementary reflectors. +*> +*> 2. The elements on and above the diagonal of +*> the subarray A(1:K,1:N) contain K-by-N +*> upper-trapezoidal matrix +*> R(K)_approx = ( R11(K), R12(K) ). +*> NOTE: If K=min(M,N), i.e. full rank factorization, +*> then R_approx(K) is the full factor R which +*> is upper-trapezoidal. If, in addition, M>=N, +*> then R is upper-triangular. +*> +*> 3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K) +*> rectangular matrix R(K)_residual = R22(K). +*> +*> b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains +*> the M-by-NRHS product Q(K)**T * B. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> This is the leading dimension for both matrices, A and B. +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> +*> NOTE: If K = 0, a) the arrays A and B are not modified; +*> b) the array TAU(1:min(M,N)) is set to ZERO, +*> if the matrix A does not contain NaN, +*> otherwise the elements TAU(1:min(M,N)) +*> are undefined; +*> c) the elements of the array JPIV are set +*> as follows: for j = 1:N, JPIV(j) = j. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is REAL +*> The maximum column 2-norm of the residual matrix R22(K), +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then MAXC2NRMK equals the maximum column 2-norm +*> of the original matrix A. +*> +*> b) If 0 < K < min(M,N), then MAXC2NRMK is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then MAXC2NRMK = 0.0. +*> +*> NOTE: MAXC2NRMK in the factorization step K would equal +*> R(K+1,K+1) in the next factorization step K+1. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is REAL +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix R22(K) (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then RELMAXC2NRMK = 1.0. +*> +*> b) If 0 < K < min(M,N), then +*> RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then RELMAXC2NRMK = 0.0. +*> +*> NOTE: RELMAXC2NRMK in the factorization step K would equal +*> abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization +*> step K+1. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The elements of the array JPIV(1:N) are always set +*> by the routine, for example, even when no columns +*> were factorized, i.e. when K = 0, the elements are +*> set as JPIV(j) = j for j = 1:N. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is REAL array, dimension (min(M,N)) +*> The scalar factors of the elementary reflectors. +*> +*> If 0 < K <= min(M,N), only the elements TAU(1:K) of +*> the array TAU are modified by the factorization. +*> After the factorization computed, if no NaN was found +*> during the factorization, the remaining elements +*> TAU(K+1:min(M,N)) are set to zero, otherwise the +*> elements TAU(K+1:min(M,N)) are not set and therefore +*> undefined. +*> ( If K = 0, all elements of TAU are set to zero, if +*> the matrix A does not contain NaN. ) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= (3*N+NRHS-1), otherwise. +*> For optimal performance LWORK >= (2*N + NB*( N+NRHS+1 )), +*> where NB is the optimal block size for SGEQP3RK returned +*> by ILAENV. Minimal block size MINNB=2. +*> +*> NOTE: The decision, whether to use unblocked BLAS 2 +*> or blocked BLAS 3 code is based not only on the dimension +*> LWORK of the availbale workspace WORK, but also also on the +*> matrix A dimension N via crossover point NX returned +*> by ILAENV. (For N less than NX, unblocked code should be +*> used.) +*> +*> If LWORK = -1, then a workspace query is assumed; +*> the routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix in the blocked step auxiliary subroutine SLAQP3RK ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) INFO < 0: if INFO = -i, the i-th argument had an +*> illegal value. +*> 3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup geqp3rk +* +*> \par Further Details: +* ===================== +* +*> \verbatim +*> SGEQP3RK is based on the same BLAS3 Householder QR factorization +*> algorithm with column pivoting as in SGEQP3 routine which uses +*> SLARFG routine to generate Householder reflectors +*> for QR factorization. +*> +*> We can also write: +*> +*> A = A_approx(K) + A_residual(K) +*> +*> The low rank approximation matrix A(K)_approx from +*> the truncated QR factorization of rank K of the matrix A is: +*> +*> A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T +*> ( 0 0 ) +*> +*> = Q(K) * ( R11(K) R12(K) ) * P(K)**T +*> ( 0 0 ) +*> +*> The residual A_residual(K) of the matrix A is: +*> +*> A_residual(K) = Q(K) * ( 0 0 ) * P(K)**T = +*> ( 0 R(K)_residual ) +*> +*> = Q(K) * ( 0 0 ) * P(K)**T +*> ( 0 R22(K) ) +*> +*> The truncated (rank K) factorization guarantees that +*> the maximum column 2-norm of A_residual(K) is less than +*> or equal to MAXC2NRMK up to roundoff error. +*> +*> NOTE: An approximation of the null vectors +*> of A can be easily computed from R11(K) +*> and R12(K): +*> +*> Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) ) +*> ( -I ) +*> +*> \endverbatim +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, + $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ WORK, LWORK, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, K, KF, KMAX, LDA, LWORK, M, N, NRHS + REAL ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + REAL A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER INB, INBMIN, IXOVER + PARAMETER ( INB = 1, INBMIN = 2, IXOVER = 3 ) + REAL ZERO, ONE, TWO + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, DONE + INTEGER IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX, + $ JMAXC2NRM, KP1, LWKOPT, MINMN, N_SUB, NB, + $ NBMIN, NX + REAL EPS, HUGEVAL, MAXC2NRM, SAFMIN +* .. +* .. External Subroutines .. + EXTERNAL SLAQP2RK, SLAQP3RK, XERBLA +* .. +* .. External Functions .. + LOGICAL SISNAN + INTEGER ISAMAX, ILAENV + REAL SLAMCH, SNRM2, SROUNDUP_LWORK + EXTERNAL SISNAN, SLAMCH, SNRM2, ISAMAX, ILAENV, + $ SROUNDUP_LWORK +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test input arguments +* ==================== +* + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( KMAX.LT.0 ) THEN + INFO = -4 + ELSE IF( SISNAN( ABSTOL ) ) THEN + INFO = -5 + ELSE IF( SISNAN( RELTOL ) ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -8 + END IF +* +* If the input parameters M, N, NRHS, KMAX, LDA are valid: +* a) Test the input workspace size LWORK for the minimum +* size requirement IWS. +* b) Determine the optimal block size NB and optimal +* workspace size LWKOPT to be returned in WORK(1) +* in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., +* (3) when routine exits. +* Here, IWS is the miminum workspace required for unblocked +* code. +* + IF( INFO.EQ.0 ) THEN + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + IWS = 1 + LWKOPT = 1 + ELSE +* +* Minimal workspace size in case of using only unblocked +* BLAS 2 code in SLAQP2RK. +* 1) SGEQP3RK and SLAQP2RK: 2*N to store full and partial +* column 2-norms. +* 2) SLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in SLARF subroutine inside SLAQP2RK to apply an +* elementary reflector from the left. +* TOTAL_WORK_SIZE = 3*N + NRHS - 1 +* + IWS = 3*N + NRHS - 1 +* +* Assign to NB optimal block size. +* + NB = ILAENV( INB, 'SGEQP3RK', ' ', M, N, -1, -1 ) +* +* A formula for the optimal workspace size in case of using +* both unblocked BLAS 2 in SLAQP2RK and blocked BLAS 3 code +* in SLAQP3RK. +* 1) SGEQP3RK, SLAQP2RK, SLAQP3RK: 2*N to store full and +* partial column 2-norms. +* 2) SLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in SLARF subroutine to apply an elementary reflector +* from the left. +* 3) SLAQP3RK: NB*(N+NRHS) to use in the work array F that +* is used to apply a block reflector from +* the left. +* 4) SLAQP3RK: NB to use in the auxilixary array AUX. +* Sizes (2) and ((3) + (4)) should intersect, therefore +* TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2. +* + LWKOPT = 2*N + NB*( N+NRHS+1 ) + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* + IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN + INFO = -15 + END IF + END IF +* +* NOTE: The optimal workspace size is returned in WORK(1), if +* the input parameters M, N, NRHS, KMAX, LDA are valid. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGEQP3RK', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible for M=0 or N=0. +* + IF( MINMN.EQ.0 ) THEN + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* +* Initialize column pivot array JPIV. +* + DO J = 1, N + JPIV( J ) = J + END DO +* +* ================================================================== +* +* Initialize storage for partial and exact column 2-norms. +* a) The elements WORK(1:N) are used to store partial column +* 2-norms of the matrix A, and may decrease in each computation +* step; initialize to the values of complete columns 2-norms. +* b) The elements WORK(N+1:2*N) are used to store complete column +* 2-norms of the matrix A, they are not changed during the +* computation; initialize the values of complete columns 2-norms. +* + DO J = 1, N + WORK( J ) = SNRM2( M, A( 1, J ), 1 ) + WORK( N+J ) = WORK( J ) + END DO +* +* ================================================================== +* +* Compute the pivot column index and the maximum column 2-norm +* for the whole original matrix stored in A(1:M,1:N). +* + KP1 = ISAMAX( N, WORK( 1 ), 1 ) + MAXC2NRM = WORK( KP1 ) +* +* ==================================================================. +* + IF( SISNAN( MAXC2NRM ) ) THEN +* +* Check if the matrix A contains NaN, set INFO parameter +* to the column number where the first NaN is found and return +* from the routine. +* + K = 0 + INFO = KP1 +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = MAXC2NRM +* +* Array TAU is not set and contains undefined elements. +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RETURN + END IF +* +* =================================================================== +* + IF( MAXC2NRM.EQ.ZERO ) THEN +* +* Check is the matrix A is a zero matrix, set array TAU and +* return from the routine. +* + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO +* + DO J = 1, MINMN + TAU( J ) = ZERO + END DO +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RETURN +* + END IF +* +* =================================================================== +* + HUGEVAL = SLAMCH( 'Overflow' ) +* + IF( MAXC2NRM.GT.HUGEVAL ) THEN +* +* Check if the matrix A contains +Inf or -Inf, set INFO parameter +* to the column number, where the first +/-Inf is found plus N, +* and continue the computation. +* + INFO = N + KP1 +* + END IF +* +* ================================================================== +* +* Quick return if possible for the case when the first +* stopping criterion is satisfied, i.e. KMAX = 0. +* + IF( KMAX.EQ.0 ) THEN + K = 0 + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = ONE + DO J = 1, MINMN + TAU( J ) = ZERO + END DO + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* + EPS = SLAMCH('Epsilon') +* +* Adjust ABSTOL +* + IF( ABSTOL.GE.ZERO ) THEN + SAFMIN = SLAMCH('Safe minimum') + ABSTOL = MAX( ABSTOL, TWO*SAFMIN ) + END IF +* +* Adjust RELTOL +* + IF( RELTOL.GE.ZERO ) THEN + RELTOL = MAX( RELTOL, EPS ) + END IF +* +* =================================================================== +* +* JMAX is the maximum index of the column to be factorized, +* which is also limited by the first stopping criterion KMAX. +* + JMAX = MIN( KMAX, MINMN ) +* +* =================================================================== +* +* Quick return if possible for the case when the second or third +* stopping criterion for the whole original matrix is satified, +* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL +* (which is ONE <= RELTOL). +* + IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN +* + K = 0 + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = ONE +* + DO J = 1, MINMN + TAU( J ) = ZERO + END DO +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* Factorize columns +* ================================================================== +* +* Determine the block size. +* + NBMIN = 2 + NX = 0 +* + IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN +* +* Determine when to cross over from blocked to unblocked code. +* (for N less than NX, unblocked code should be used). +* + NX = MAX( 0, ILAENV( IXOVER, 'SGEQP3RK', ' ', M, N, -1, -1 )) +* + IF( NX.LT.MINMN ) THEN +* +* Determine if workspace is large enough for blocked code. +* + IF( LWORK.LT.LWKOPT ) THEN +* +* Not enough workspace to use optimal block size that +* is currently stored in NB. +* Reduce NB and determine the minimum value of NB. +* + NB = ( LWORK-2*N ) / ( N+1 ) + NBMIN = MAX( 2, ILAENV( INBMIN, 'SGEQP3RK', ' ', M, N, + $ -1, -1 ) ) +* + END IF + END IF + END IF +* +* ================================================================== +* +* DONE is the boolean flag to rerpresent the case when the +* factorization completed in the block factorization routine, +* before the end of the block. +* + DONE = .FALSE. +* +* J is the column index. +* + J = 1 +* +* (1) Use blocked code initially. +* +* JMAXB is the maximum column index of the block, when the +* blocked code is used, is also limited by the first stopping +* criterion KMAX. +* + JMAXB = MIN( KMAX, MINMN - NX ) +* + IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN +* +* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: +* J is the column index of a column block; +* JB is the column block size to pass to block factorization +* routine in a loop step; +* JBF is the number of columns that were actually factorized +* that was returned by the block factorization routine +* in a loop step, JBF <= JB; +* N_SUB is the number of columns in the submatrix; +* IOFFSET is the number of rows that should not be factorized. +* + DO WHILE( J.LE.JMAXB ) +* + JB = MIN( NB, JMAXB-J+1 ) + N_SUB = N-J+1 + IOFFSET = J-1 +* +* Factorize JB columns among the columns A(J:N). +* + CALL SLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA, + $ DONE, JBF, MAXC2NRMK, RELMAXC2NRMK, + $ JPIV( J ), TAU( J ), + $ WORK( J ), WORK( N+J ), + $ WORK( 2*N+1 ), WORK( 2*N+JB+1 ), + $ N+NRHS-J+1, IWORK, IINFO ) +* +* Set INFO on the first occurence of Inf. +* + IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN + INFO = 2*IOFFSET + IINFO + END IF +* + IF( DONE ) THEN +* +* Either the submatrix is zero before the end of the +* column block, or ABSTOL or RELTOL criterion is +* satisfied before the end of the column block, we can +* return from the routine. Perform the following before +* returning: +* a) Set the number of factorized columns K, +* K = IOFFSET + JBF from the last call of blocked +* routine. +* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned +* by the block factorization routine; +* 2) The remaining TAUs are set to ZERO by the +* block factorization routine. +* + K = IOFFSET + JBF +* +* Set INFO on the first occurrence of NaN, NaN takes +* prcedence over Inf. +* + IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN + INFO = IOFFSET + IINFO + END IF +* +* Return from the routine. +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* + RETURN +* + END IF +* + J = J + JBF +* + END DO +* + END IF +* +* Use unblocked code to factor the last or only block. +* J = JMAX+1 means we factorized the maximum possible number of +* columns, that is in ELSE clause we need to compute +* the MAXC2NORM and RELMAXC2NORM to return after we processed +* the blocks. +* + IF( J.LE.JMAX ) THEN +* +* N_SUB is the number of columns in the submatrix; +* IOFFSET is the number of rows that should not be factorized. +* + N_SUB = N-J+1 + IOFFSET = J-1 +* + CALL SLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1, + $ ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA, + $ KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ), + $ TAU( J ), WORK( J ), WORK( N+J ), + $ WORK( 2*N+1 ), IINFO ) +* +* ABSTOL or RELTOL criterion is satisfied when the number of +* the factorized columns KF is smaller then the number +* of columns JMAX-J+1 supplied to be factorized by the +* unblocked routine, we can return from +* the routine. Perform the following before returning: +* a) Set the number of factorized columns K, +* b) MAXC2NRMK and RELMAXC2NRMK are returned by the +* unblocked factorization routine above. +* + K = J - 1 + KF +* +* Set INFO on the first exception occurence. +* +* Set INFO on the first exception occurence of Inf or NaN, +* (NaN takes precedence over Inf). +* + IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN + INFO = 2*IOFFSET + IINFO + ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN + INFO = IOFFSET + IINFO + END IF +* + ELSE +* +* Compute the return values for blocked code. +* +* Set the number of factorized columns if the unblocked routine +* was not called. +* + K = JMAX +* +* If there exits a residual matrix after the blocked code: +* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the +* residual matrix, otherwise set them to ZERO; +* 2) Set TAU(K+1:MINMN) to ZERO. +* + IF( K.LT.MINMN ) THEN + JMAXC2NRM = K + ISAMAX( N-K, WORK( K+1 ), 1 ) + MAXC2NRMK = WORK( JMAXC2NRM ) + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + DO J = K + 1, MINMN + TAU( J ) = ZERO + END DO +* + END IF +* +* END IF( J.LE.JMAX ) THEN +* + END IF +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* + RETURN +* +* End of SGEQP3RK +* + END diff --git a/lapack-netlib/SRC/sgeqr.f b/lapack-netlib/SRC/sgeqr.f index 6f41a92ea..79a515e1c 100644 --- a/lapack-netlib/SRC/sgeqr.f +++ b/lapack-netlib/SRC/sgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE SGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,11 +190,13 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLATSQR, SGEQRT, XERBLA @@ -244,8 +248,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +259,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +274,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +288,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +315,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) * RETURN * diff --git a/lapack-netlib/SRC/sgeqrf.f b/lapack-netlib/SRC/sgeqrf.f index b24615f7a..689fe1aea 100644 --- a/lapack-netlib/SRC/sgeqrf.f +++ b/lapack-netlib/SRC/sgeqrf.f @@ -121,7 +121,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup geqrf * *> \par Further Details: * ===================== @@ -170,7 +170,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -199,7 +200,7 @@ ELSE LWKOPT = N*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * @@ -274,7 +275,7 @@ $ CALL SGEQR2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of SGEQRF diff --git a/lapack-netlib/SRC/sgeqrfp.f b/lapack-netlib/SRC/sgeqrfp.f index 03d33654b..37747c512 100644 --- a/lapack-netlib/SRC/sgeqrfp.f +++ b/lapack-netlib/SRC/sgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -122,7 +123,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup geqrfp * *> \par Further Details: * ===================== @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. EXTERNAL SGEQR2P, SLARFB, SLARFT, XERBLA @@ -174,6 +175,8 @@ * .. External Functions .. INTEGER ILAENV EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -181,8 +184,16 @@ * INFO = 0 NB = ILAENV( 1, 'SGEQRF', ' ', M, N, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = LWKOPT + K = MIN( M, N ) + IF( K.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = N + LWKOPT = N*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -190,7 +201,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN @@ -210,7 +221,7 @@ * NBMIN = 2 NX = 0 - IWS = N + IWS = LWKMIN IF( NB.GT.1 .AND. NB.LT.K ) THEN * * Determine when to cross over from blocked to unblocked code. @@ -272,7 +283,7 @@ $ CALL SGEQR2P( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of SGEQRFP diff --git a/lapack-netlib/SRC/sgerqf.f b/lapack-netlib/SRC/sgerqf.f index 037cd5345..1d3400a1f 100644 --- a/lapack-netlib/SRC/sgerqf.f +++ b/lapack-netlib/SRC/sgerqf.f @@ -114,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup gerqf * *> \par Further Details: * ===================== @@ -163,7 +163,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -187,7 +188,7 @@ NB = ILAENV( 1, 'SGERQF', ' ', M, N, -1, -1 ) LWKOPT = M*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF ( .NOT.LQUERY ) THEN IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) @@ -278,7 +279,7 @@ IF( MU.GT.0 .AND. NU.GT.0 ) $ CALL SGERQ2( MU, NU, A, LDA, TAU, WORK, IINFO ) * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of SGERQF diff --git a/lapack-netlib/SRC/sgesdd.f b/lapack-netlib/SRC/sgesdd.f index d3b5e3ba1..d271bb757 100644 --- a/lapack-netlib/SRC/sgesdd.f +++ b/lapack-netlib/SRC/sgesdd.f @@ -55,12 +55,6 @@ *> *> Note that the routine returns VT = V**T, not V. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/sgesvd.f b/lapack-netlib/SRC/sgesvd.f index 83321ffaa..d3fa94582 100644 --- a/lapack-netlib/SRC/sgesvd.f +++ b/lapack-netlib/SRC/sgesvd.f @@ -203,7 +203,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEsing +*> \ingroup gesvd * * ===================================================================== SUBROUTINE SGESVD( JOBU, JOBVT, M, N, A, LDA, S, U, LDU, VT, LDVT, @@ -251,8 +251,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, SQRT @@ -628,7 +628,7 @@ END IF END IF MAXWRK = MAX( MAXWRK, MINWRK ) - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -13 @@ -3493,7 +3493,7 @@ * * Return optimal workspace in WORK(1) * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * RETURN * diff --git a/lapack-netlib/SRC/sgesvdq.f b/lapack-netlib/SRC/sgesvdq.f index ebbfc70b1..6fb328b8c 100644 --- a/lapack-netlib/SRC/sgesvdq.f +++ b/lapack-netlib/SRC/sgesvdq.f @@ -365,7 +365,7 @@ *> an optimal implementation would do all necessary scaling before calling *> CGESVD and the scaling in CGESVD can be switched off. *> 3. Other comments related to code optimization are given in comments in the -*> code, enlosed in [[double brackets]]. +*> code, enclosed in [[double brackets]]. *> \endverbatim * *> \par Bugs, examples and comments diff --git a/lapack-netlib/SRC/sgesvdx.f b/lapack-netlib/SRC/sgesvdx.f index b6495dbd4..8b55b9b2e 100644 --- a/lapack-netlib/SRC/sgesvdx.f +++ b/lapack-netlib/SRC/sgesvdx.f @@ -254,7 +254,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEsing +*> \ingroup gesvdx * * ===================================================================== SUBROUTINE SGESVDX( JOBU, JOBVT, RANGE, M, N, A, LDA, VL, VU, @@ -301,8 +301,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, SQRT @@ -456,7 +456,7 @@ END IF END IF MAXWRK = MAX( MAXWRK, MINWRK ) - WORK( 1 ) = REAL( MAXWRK ) + WORK( 1 ) = SROUNDUP_LWORK( MAXWRK ) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -19 @@ -822,7 +822,7 @@ * * Return optimal workspace in WORK(1) * - WORK( 1 ) = REAL( MAXWRK ) + WORK( 1 ) = SROUNDUP_LWORK( MAXWRK ) * RETURN * diff --git a/lapack-netlib/SRC/sgesvj.f b/lapack-netlib/SRC/sgesvj.f index 3f53a5a15..36aed2853 100644 --- a/lapack-netlib/SRC/sgesvj.f +++ b/lapack-netlib/SRC/sgesvj.f @@ -208,7 +208,7 @@ *> *> \param[in,out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On entry, *> If JOBU = 'C' : *> WORK(1) = CTOL, where CTOL defines the threshold for convergence. @@ -239,7 +239,12 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> length of WORK, WORK >= MAX(6,M+N) +*> Length of WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(6,M+N), otherwise. +*> +*> If on entry LWORK = -1, then a workspace query is assumed and +*> no computation is done; WORK(1) is set to the minial (and optimal) +*> length of WORK. *> \endverbatim *> *> \param[out] INFO @@ -260,7 +265,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup gesvj * *> \par Further Details: * ===================== @@ -351,9 +356,9 @@ INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, - $ SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + $ SWBAND, MINMN, LWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, + $ ROTOK, RSVEC, UCTOL, UPPER * .. * .. Local Arrays .. REAL FASTR( 5 ) @@ -369,8 +374,8 @@ INTEGER ISAMAX EXTERNAL ISAMAX * from LAPACK - REAL SLAMCH - EXTERNAL SLAMCH + REAL SLAMCH, SROUNDUP_LWORK + EXTERNAL SLAMCH, SROUNDUP_LWORK LOGICAL LSAME EXTERNAL LSAME * .. @@ -394,6 +399,14 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 6, M+N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -413,7 +426,7 @@ INFO = -11 ELSE IF( UCTOL .AND. ( WORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( LWORK.LT.MAX( M+N, 6 ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 ELSE INFO = 0 @@ -423,11 +436,14 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGESVJ', -INFO ) RETURN + ELSE IF( LQUERY ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/sgetri.f b/lapack-netlib/SRC/sgetri.f index 749ede9a7..7b06bb63d 100644 --- a/lapack-netlib/SRC/sgetri.f +++ b/lapack-netlib/SRC/sgetri.f @@ -107,7 +107,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup getri * * ===================================================================== SUBROUTINE SGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -138,6 +138,8 @@ * .. External Functions .. INTEGER ILAENV EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMM, SGEMV, SSWAP, STRSM, STRTRI, XERBLA @@ -151,8 +153,9 @@ * INFO = 0 NB = ILAENV( 1, 'SGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = LWKOPT + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -250,7 +253,7 @@ $ CALL SSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) 60 CONTINUE * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of SGETRI diff --git a/lapack-netlib/SRC/sgetsls.f b/lapack-netlib/SRC/sgetsls.f index e6ce705fa..08a427a8b 100644 --- a/lapack-netlib/SRC/sgetsls.f +++ b/lapack-netlib/SRC/sgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEsolve +*> \ingroup getsls * * ===================================================================== SUBROUTINE SGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB, @@ -188,15 +188,15 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, SLANGE - EXTERNAL LSAME, SLABAD, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEQR, SGEMQR, SLASCL, SLASET, $ STRTRS, XERBLA, SGELQ, SGEMLQ * .. * .. Intrinsic Functions .. - INTRINSIC REAL, MAX, MIN, INT + INTRINSIC MAX, MIN, INT * .. * .. Executable Statements .. * @@ -226,7 +226,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEO = 1 + WSIZEM = 1 + ELSE IF( M.GE.N ) THEN CALL SGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) @@ -262,7 +265,7 @@ INFO = -10 END IF * - WORK( 1 ) = REAL( WSIZEO ) + WORK( 1 ) = SROUNDUP_LWORK( WSIZEO ) * END IF * @@ -271,7 +274,7 @@ RETURN END IF IF( LQUERY ) THEN - IF( LWORK.EQ.-2 ) WORK( 1 ) = REAL( WSIZEM ) + IF( LWORK.EQ.-2 ) WORK( 1 ) = SROUNDUP_LWORK( WSIZEM ) RETURN END IF IF( LWORK.LT.WSIZEO ) THEN @@ -294,7 +297,6 @@ * SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * @@ -482,7 +484,7 @@ END IF * 50 CONTINUE - WORK( 1 ) = REAL( TSZO + LWO ) + WORK( 1 ) = SROUNDUP_LWORK( TSZO + LWO ) RETURN * * End of SGETSLS diff --git a/lapack-netlib/SRC/sgetsqrhrt.f b/lapack-netlib/SRC/sgetsqrhrt.f index f9580da7b..7ade8a66c 100644 --- a/lapack-netlib/SRC/sgetsqrhrt.f +++ b/lapack-netlib/SRC/sgetsqrhrt.f @@ -130,14 +130,17 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -160,7 +163,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup singleOTHERcomputational +*> \ingroup getsqrhrt * *> \par Contributors: * ================== @@ -200,6 +203,10 @@ INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS * .. +* .. External Functions .. + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL SCOPY, SLATSQR, SORGTSQR_ROW, SORHR_COL, $ XERBLA @@ -212,7 +219,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -225,7 +232,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -263,8 +270,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN INFO = -11 END IF * @@ -277,14 +285,14 @@ CALL XERBLA( 'SGETSQRHRT', -INFO ) RETURN ELSE IF ( LQUERY ) THEN - WORK( 1 ) = REAL( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * * Quick return if possible * IF( MIN( M, N ).EQ.0 ) THEN - WORK( 1 ) = REAL( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * @@ -341,9 +349,9 @@ END IF END DO * - WORK( 1 ) = REAL( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN * * End of SGETSQRHRT * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/sgges.f b/lapack-netlib/SRC/sgges.f index 3834aea00..8f42882dd 100644 --- a/lapack-netlib/SRC/sgges.f +++ b/lapack-netlib/SRC/sgges.f @@ -275,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEeigen +*> \ingroup gges * * ===================================================================== SUBROUTINE SGGES( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, LDB, @@ -321,15 +321,14 @@ REAL DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLABAD, - $ SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGSEN, - $ XERBLA + EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLACPY, + $ SLASCL, SLASET, SORGQR, SORMQR, STGSEN * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT @@ -406,7 +405,7 @@ MINWRK = 1 MAXWRK = 1 END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) $ INFO = -19 @@ -431,7 +430,6 @@ EPS = SLAMCH( 'P' ) SAFMIN = SLAMCH( 'S' ) SAFMAX = ONE / SAFMIN - CALL SLABAD( SAFMIN, SAFMAX ) SMLNUM = SQRT( SAFMIN ) / EPS BIGNUM = ONE / SMLNUM * @@ -668,7 +666,7 @@ * 40 CONTINUE * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * RETURN * diff --git a/lapack-netlib/SRC/sgges3.f b/lapack-netlib/SRC/sgges3.f index b27704ff5..e90cd6947 100644 --- a/lapack-netlib/SRC/sgges3.f +++ b/lapack-netlib/SRC/sgges3.f @@ -234,6 +234,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK >= 6*N+16. +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -273,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEeigen +*> \ingroup gges3 * * ===================================================================== SUBROUTINE SGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, @@ -309,7 +311,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, LST2SL, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT + $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT, + $ LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SAFMAX, SAFMIN, SMLNUM * .. @@ -318,14 +321,13 @@ REAL DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHD3, SLAQZ0, SLABAD, - $ SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGSEN, - $ XERBLA + EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHD3, SLAQZ0, SLACPY, + $ SLASCL, SLASET, SORGQR, SORMQR, STGSEN, XERBLA * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, SLANGE - EXTERNAL LSAME, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT @@ -362,6 +364,12 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + IF( N.EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 6*N+16 + END IF +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -378,7 +386,7 @@ INFO = -15 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -17 - ELSE IF( LWORK.LT.6*N+16 .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -19 END IF * @@ -386,7 +394,7 @@ * IF( INFO.EQ.0 ) THEN CALL SGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 6*N+16, 3*N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL SORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) @@ -408,7 +416,11 @@ $ IERR ) LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) END IF - WORK( 1 ) = LWKOPT + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + END IF END IF * IF( INFO.NE.0 ) THEN @@ -422,6 +434,7 @@ * IF( N.EQ.0 ) THEN SDIM = 0 + WORK( 1 ) = 1 RETURN END IF * @@ -430,7 +443,6 @@ EPS = SLAMCH( 'P' ) SAFMIN = SLAMCH( 'S' ) SAFMAX = ONE / SAFMIN - CALL SLABAD( SAFMIN, SAFMAX ) SMLNUM = SQRT( SAFMIN ) / EPS BIGNUM = ONE / SMLNUM * @@ -659,7 +671,7 @@ * 40 CONTINUE * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sggesx.f b/lapack-netlib/SRC/sggesx.f index a6c0443ba..e5a14fc19 100644 --- a/lapack-netlib/SRC/sggesx.f +++ b/lapack-netlib/SRC/sggesx.f @@ -337,7 +337,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEeigen +*> \ingroup ggesx * *> \par Further Details: * ===================== @@ -405,15 +405,14 @@ REAL DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLABAD, - $ SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGSEN, - $ XERBLA + EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLACPY, + $ SLASCL, SLASET, SORGQR, SORMQR, STGSEN, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT @@ -510,7 +509,7 @@ MAXWRK = 1 LWRK = 1 END IF - WORK( 1 ) = LWRK + WORK( 1 ) = SROUNDUP_LWORK(LWRK) IF( WANTSN .OR. N.EQ.0 ) THEN LIWMIN = 1 ELSE @@ -544,7 +543,6 @@ EPS = SLAMCH( 'P' ) SAFMIN = SLAMCH( 'S' ) SAFMAX = ONE / SAFMIN - CALL SLABAD( SAFMIN, SAFMAX ) SMLNUM = SQRT( SAFMIN ) / EPS BIGNUM = ONE / SMLNUM * @@ -807,7 +805,7 @@ * 50 CONTINUE * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/sggev.f b/lapack-netlib/SRC/sggev.f index 69744b72b..cacad7cac 100644 --- a/lapack-netlib/SRC/sggev.f +++ b/lapack-netlib/SRC/sggev.f @@ -218,7 +218,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEeigen +*> \ingroup ggev * * ===================================================================== SUBROUTINE SGGEV( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHAR, ALPHAI, @@ -257,15 +257,14 @@ LOGICAL LDUMMA( 1 ) * .. * .. External Subroutines .. - EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLABAD, - $ SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGEVC, - $ XERBLA + EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLACPY, + $ SLASCL, SLASET, SORGQR, SORMQR, STGEVC, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT @@ -335,7 +334,7 @@ MAXWRK = MAX( MAXWRK, N*( 7 + $ ILAENV( 1, 'SORGQR', ' ', N, 1, N, -1 ) ) ) END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) $ INFO = -16 @@ -358,7 +357,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -581,7 +579,7 @@ CALL SLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR ) END IF * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of SGGEV diff --git a/lapack-netlib/SRC/sggev3.f b/lapack-netlib/SRC/sggev3.f index 945c3a017..d788d1147 100644 --- a/lapack-netlib/SRC/sggev3.f +++ b/lapack-netlib/SRC/sggev3.f @@ -189,6 +189,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. LWORK >= MAX(1,8*N). +*> For good performance, LWORK should generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -217,7 +219,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEeigen +*> \ingroup ggev3 * * ===================================================================== SUBROUTINE SGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHAR, @@ -248,7 +250,8 @@ LOGICAL ILASCL, ILBSCL, ILV, ILVL, ILVR, LQUERY CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, - $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT + $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT, + $ LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP * .. @@ -256,14 +259,13 @@ LOGICAL LDUMMA( 1 ) * .. * .. External Subroutines .. - EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHD3, SLAQZ0, SLABAD, - $ SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGEVC, - $ XERBLA + EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHD3, SLAQZ0, SLACPY, + $ SLASCL, SLASET, SORGQR, SORMQR, STGEVC * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, SLANGE - EXTERNAL LSAME, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT @@ -299,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 8*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -313,7 +316,7 @@ INFO = -12 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -14 - ELSE IF( LWORK.LT.MAX( 1, 8*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF * @@ -321,28 +324,31 @@ * IF( INFO.EQ.0 ) THEN CALL SGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, 8*N, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL SORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL SGGHD3( JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, VL, LDVL, $ VR, LDVR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) IF( ILVL ) THEN CALL SORGQR( N, N, N, VL, LDVL, WORK, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL SLAQZ0( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) ELSE CALL SLAQZ0( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF - WORK( 1 ) = REAL( LWKOPT ) -* END IF * IF( INFO.NE.0 ) THEN @@ -362,7 +368,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -578,7 +583,7 @@ CALL SLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR ) END IF * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SGGEV3 diff --git a/lapack-netlib/SRC/sggevx.f b/lapack-netlib/SRC/sggevx.f index bb05f499a..63164a021 100644 --- a/lapack-netlib/SRC/sggevx.f +++ b/lapack-netlib/SRC/sggevx.f @@ -352,7 +352,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEeigen +*> \ingroup ggevx * *> \par Further Details: * ===================== @@ -427,15 +427,15 @@ LOGICAL LDUMMA( 1 ) * .. * .. External Subroutines .. - EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLABAD, - $ SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGEVC, - $ STGSNA, XERBLA + EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLACPY, + $ SLASCL, SLASET, SORGQR, SORMQR, STGEVC, STGSNA, + $ XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT @@ -532,7 +532,7 @@ $ N*ILAENV( 1, 'SORGQR', ' ', N, 1, N, 0 ) ) END IF END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -26 @@ -557,7 +557,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -855,7 +854,7 @@ CALL SLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR ) END IF * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of SGGEVX diff --git a/lapack-netlib/SRC/sggglm.f b/lapack-netlib/SRC/sggglm.f index 56b4dba52..37094e4f2 100644 --- a/lapack-netlib/SRC/sggglm.f +++ b/lapack-netlib/SRC/sggglm.f @@ -177,7 +177,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHEReigen +*> \ingroup ggglm * * ===================================================================== SUBROUTINE SGGGLM( N, M, P, A, LDA, B, LDB, D, X, Y, WORK, LWORK, @@ -212,7 +212,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN @@ -251,7 +252,7 @@ LWKMIN = M + N + P LWKOPT = M + NP + MAX( N, P )*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -12 diff --git a/lapack-netlib/SRC/sgghd3.f b/lapack-netlib/SRC/sgghd3.f index 23acf6ec5..01e57088a 100644 --- a/lapack-netlib/SRC/sgghd3.f +++ b/lapack-netlib/SRC/sgghd3.f @@ -179,14 +179,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> -*> \param[in] LWORK +*> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -211,7 +211,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup gghd3 * *> \par Further Details: * ===================== @@ -260,14 +260,15 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGGHRD, SLARTG, SLASET, SORM22, SROT, SGEMM, $ SGEMV, STRMV, SLACPY, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC REAL, MAX + INTRINSIC MAX * .. * .. Executable Statements .. * @@ -275,8 +276,13 @@ * INFO = 0 NB = ILAENV( 1, 'SGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) - WORK( 1 ) = REAL( LWKOPT ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) INITZ = LSAME( COMPZ, 'I' ) @@ -325,7 +331,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = ONE RETURN @@ -885,7 +890,8 @@ IF ( JCOL.LT.IHI ) $ CALL SGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) - WORK( 1 ) = REAL( LWKOPT ) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sgglse.f b/lapack-netlib/SRC/sgglse.f index 59addc3f4..53e3f8e45 100644 --- a/lapack-netlib/SRC/sgglse.f +++ b/lapack-netlib/SRC/sgglse.f @@ -172,7 +172,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERsolve +*> \ingroup gglse * * ===================================================================== SUBROUTINE SGGLSE( M, N, P, A, LDA, B, LDB, C, D, X, WORK, LWORK, @@ -207,7 +207,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN @@ -246,7 +247,7 @@ LWKMIN = M + N + P LWKOPT = P + MN + MAX( M, N )*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -12 diff --git a/lapack-netlib/SRC/sggqrf.f b/lapack-netlib/SRC/sggqrf.f index 59b498da5..d32b48410 100644 --- a/lapack-netlib/SRC/sggqrf.f +++ b/lapack-netlib/SRC/sggqrf.f @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup ggqrf * *> \par Further Details: * ===================== @@ -237,6 +237,8 @@ * .. External Functions .. INTEGER ILAENV EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN @@ -250,8 +252,9 @@ NB2 = ILAENV( 1, 'SGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'SORMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB - WORK( 1 ) = LWKOPT + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -287,7 +290,9 @@ * RQ factorization of N-by-P matrix B: B = T*Z. * CALL SGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO ) - WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) + LWKOPT = MAX( LOPT, INT( WORK( 1 ) ) ) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sggrqf.f b/lapack-netlib/SRC/sggrqf.f index 8b7d4786a..b3842ec2a 100644 --- a/lapack-netlib/SRC/sggrqf.f +++ b/lapack-netlib/SRC/sggrqf.f @@ -172,7 +172,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup ggrqf * *> \par Further Details: * ===================== @@ -235,7 +235,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN @@ -249,8 +250,8 @@ NB2 = ILAENV( 1, 'SGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'SORMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P)*NB - WORK( 1 ) = LWKOPT + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -287,7 +288,8 @@ * QR factorization of P-by-N matrix B: B = Z*T * CALL SGEQRF( P, N, B, LDB, TAUB, WORK, LWORK, INFO ) - WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) + LWKOPT = MAX( LOPT, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sggsvd3.f b/lapack-netlib/SRC/sggsvd3.f index 9077f2ea8..cee630593 100644 --- a/lapack-netlib/SRC/sggsvd3.f +++ b/lapack-netlib/SRC/sggsvd3.f @@ -278,7 +278,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -328,7 +328,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEsing +*> \ingroup ggsvd3 * *> \par Contributors: * ================== @@ -372,8 +372,8 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, SLANGE - EXTERNAL LSAME, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SGGSVP3, STGSJA, XERBLA @@ -429,7 +429,7 @@ LWKOPT = N + INT( WORK( 1 ) ) LWKOPT = MAX( 2*N, LWKOPT ) LWKOPT = MAX( 1, LWKOPT ) - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -492,7 +492,7 @@ END IF 20 CONTINUE * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SGGSVD3 diff --git a/lapack-netlib/SRC/sggsvp3.f b/lapack-netlib/SRC/sggsvp3.f index 4f76b32bc..8e90d770c 100644 --- a/lapack-netlib/SRC/sggsvp3.f +++ b/lapack-netlib/SRC/sggsvp3.f @@ -227,7 +227,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -250,7 +250,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup ggsvp3 * *> \par Further Details: * ===================== @@ -301,6 +301,8 @@ * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEQP3, SGEQR2, SGERQ2, SLACPY, SLAPMT, @@ -365,7 +367,7 @@ CALL SGEQP3( M, N, A, LDA, IWORK, TAU, WORK, -1, INFO ) LWKOPT = MAX( LWKOPT, INT( WORK ( 1 ) ) ) LWKOPT = MAX( 1, LWKOPT ) - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -560,7 +562,7 @@ * END IF * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SGGSVP3 diff --git a/lapack-netlib/SRC/sgsvj0.f b/lapack-netlib/SRC/sgsvj0.f index 23b6f0077..7e5fddeb0 100644 --- a/lapack-netlib/SRC/sgsvj0.f +++ b/lapack-netlib/SRC/sgsvj0.f @@ -52,10 +52,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim @@ -117,7 +117,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -125,9 +125,9 @@ *> \param[in,out] V *> \verbatim *> V is REAL array, dimension (LDV,N) -*> If JOBV = 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/sgsvj1.f b/lapack-netlib/SRC/sgsvj1.f index 0345ccb42..f1d1f7239 100644 --- a/lapack-netlib/SRC/sgsvj1.f +++ b/lapack-netlib/SRC/sgsvj1.f @@ -75,10 +75,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim @@ -147,7 +147,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -155,9 +155,9 @@ *> \param[in,out] V *> \verbatim *> V is REAL array, dimension (LDV,N) -*> If JOBV = 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/shgeqz.f b/lapack-netlib/SRC/shgeqz.f index 6543f8cb1..9ad64d2bf 100644 --- a/lapack-netlib/SRC/shgeqz.f +++ b/lapack-netlib/SRC/shgeqz.f @@ -282,7 +282,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup hgeqz * *> \par Further Details: * ===================== @@ -346,8 +346,9 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, SLANHS, SLAPY2, SLAPY3 - EXTERNAL LSAME, SLAMCH, SLANHS, SLAPY2, SLAPY3 + REAL SLAMCH, SLANHS, SLAPY2, SLAPY3, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANHS, SLAPY2, SLAPY3, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLAG2, SLARFG, SLARTG, SLASET, SLASV2, SROT, @@ -1364,7 +1365,7 @@ * Exit (other than argument error) -- return optimal workspace size * 420 CONTINUE - WORK( 1 ) = REAL( N ) + WORK( 1 ) = SROUNDUP_LWORK( N ) RETURN * * End of SHGEQZ diff --git a/lapack-netlib/SRC/shseqr.f b/lapack-netlib/SRC/shseqr.f index 3b8d4c4d8..68b9fe6bd 100644 --- a/lapack-netlib/SRC/shseqr.f +++ b/lapack-netlib/SRC/shseqr.f @@ -233,7 +233,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup hseqr * *> \par Contributors: * ================== @@ -358,7 +358,8 @@ * .. External Functions .. INTEGER ILAENV LOGICAL LSAME - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLACPY, SLAHQR, SLAQR0, SLASET, XERBLA @@ -373,7 +374,7 @@ WANTT = LSAME( JOB, 'S' ) INITZ = LSAME( COMPZ, 'I' ) WANTZ = INITZ .OR. LSAME( COMPZ, 'V' ) - WORK( 1 ) = REAL( MAX( 1, N ) ) + WORK( 1 ) = SROUNDUP_LWORK( MAX( 1, N ) ) LQUERY = LWORK.EQ.-1 * INFO = 0 diff --git a/lapack-netlib/SRC/sla_gbrfsx_extended.f b/lapack-netlib/SRC/sla_gbrfsx_extended.f index 499d6bf90..c79330cb5 100644 --- a/lapack-netlib/SRC/sla_gbrfsx_extended.f +++ b/lapack-netlib/SRC/sla_gbrfsx_extended.f @@ -644,7 +644,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL SAXPY( N, 1.0, DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/sla_gerfsx_extended.f b/lapack-netlib/SRC/sla_gerfsx_extended.f index de05d8eb4..1a19a5071 100644 --- a/lapack-netlib/SRC/sla_gerfsx_extended.f +++ b/lapack-netlib/SRC/sla_gerfsx_extended.f @@ -628,7 +628,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL SAXPY( N, 1.0, DY, 1, Y( 1, J ), 1 ) diff --git a/lapack-netlib/SRC/sla_porfsx_extended.f b/lapack-netlib/SRC/sla_porfsx_extended.f index ada4cad21..9f33b14dd 100644 --- a/lapack-netlib/SRC/sla_porfsx_extended.f +++ b/lapack-netlib/SRC/sla_porfsx_extended.f @@ -617,7 +617,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL SAXPY( N, 1.0, DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/sla_porpvgrw.f b/lapack-netlib/SRC/sla_porpvgrw.f index a97e7f72a..8064bf7fa 100644 --- a/lapack-netlib/SRC/sla_porpvgrw.f +++ b/lapack-netlib/SRC/sla_porpvgrw.f @@ -132,9 +132,9 @@ * UPPER = LSAME( 'Upper', UPLO ) * -* SPOTRF will have factored only the NCOLSxNCOLS leading minor, so -* we restrict the growth search to that minor and use only the first -* 2*NCOLS workspace entries. +* SPOTRF will have factored only the NCOLSxNCOLS leading submatrix, +* so we restrict the growth search to that submatrix and use only +* the first 2*NCOLS workspace entries. * RPVGRW = 1.0 DO I = 1, 2*NCOLS diff --git a/lapack-netlib/SRC/sla_syrfsx_extended.f b/lapack-netlib/SRC/sla_syrfsx_extended.f index d5096be02..2fa3b1c50 100644 --- a/lapack-netlib/SRC/sla_syrfsx_extended.f +++ b/lapack-netlib/SRC/sla_syrfsx_extended.f @@ -647,7 +647,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL SAXPY( N, 1.0, DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/slaed2.f b/lapack-netlib/SRC/slaed2.f index 16500e74c..cadf53555 100644 --- a/lapack-netlib/SRC/slaed2.f +++ b/lapack-netlib/SRC/slaed2.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE SLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMDA, W, +* SUBROUTINE SLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMBDA, W, * Q2, INDX, INDXC, INDXP, COLTYP, INFO ) * * .. Scalar Arguments .. @@ -28,7 +28,7 @@ * .. Array Arguments .. * INTEGER COLTYP( * ), INDX( * ), INDXC( * ), INDXP( * ), * $ INDXQ( * ) -* REAL D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), +* REAL D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), * $ W( * ), Z( * ) * .. * @@ -123,9 +123,9 @@ *> process. *> \endverbatim *> -*> \param[out] DLAMDA +*> \param[out] DLAMBDA *> \verbatim -*> DLAMDA is REAL array, dimension (N) +*> DLAMBDA is REAL array, dimension (N) *> A copy of the first K eigenvalues which will be used by *> SLAED3 to form the secular equation. *> \endverbatim @@ -148,7 +148,7 @@ *> \param[out] INDX *> \verbatim *> INDX is INTEGER array, dimension (N) -*> The permutation used to sort the contents of DLAMDA into +*> The permutation used to sort the contents of DLAMBDA into *> ascending order. *> \endverbatim *> @@ -207,7 +207,7 @@ *> Modified by Francoise Tisseur, University of Tennessee *> * ===================================================================== - SUBROUTINE SLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMDA, W, + SUBROUTINE SLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMBDA, W, $ Q2, INDX, INDXC, INDXP, COLTYP, INFO ) * * -- LAPACK computational routine -- @@ -221,7 +221,7 @@ * .. Array Arguments .. INTEGER COLTYP( * ), INDX( * ), INDXC( * ), INDXP( * ), $ INDXQ( * ) - REAL D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), + REAL D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), $ W( * ), Z( * ) * .. * @@ -300,9 +300,9 @@ * re-integrate the deflated parts from the last pass * DO 20 I = 1, N - DLAMDA( I ) = D( INDXQ( I ) ) + DLAMBDA( I ) = D( INDXQ( I ) ) 20 CONTINUE - CALL SLAMRG( N1, N2, DLAMDA, 1, 1, INDXC ) + CALL SLAMRG( N1, N2, DLAMBDA, 1, 1, INDXC ) DO 30 I = 1, N INDX( I ) = INDXQ( INDXC( I ) ) 30 CONTINUE @@ -324,11 +324,11 @@ DO 40 J = 1, N I = INDX( J ) CALL SCOPY( N, Q( 1, I ), 1, Q2( IQ2 ), 1 ) - DLAMDA( J ) = D( I ) + DLAMBDA( J ) = D( I ) IQ2 = IQ2 + N 40 CONTINUE CALL SLACPY( 'A', N, N, Q2, N, Q, LDQ ) - CALL SCOPY( N, DLAMDA, 1, D, 1 ) + CALL SCOPY( N, DLAMBDA, 1, D, 1 ) GO TO 190 END IF * @@ -421,7 +421,7 @@ PJ = NJ ELSE K = K + 1 - DLAMDA( K ) = D( PJ ) + DLAMBDA( K ) = D( PJ ) W( K ) = Z( PJ ) INDXP( K ) = PJ PJ = NJ @@ -433,7 +433,7 @@ * Record the last eigenvalue. * K = K + 1 - DLAMDA( K ) = D( PJ ) + DLAMBDA( K ) = D( PJ ) W( K ) = Z( PJ ) INDXP( K ) = PJ * @@ -470,9 +470,9 @@ PSM( CT ) = PSM( CT ) + 1 130 CONTINUE * -* Sort the eigenvalues and corresponding eigenvectors into DLAMDA +* Sort the eigenvalues and corresponding eigenvectors into DLAMBDA * and Q2 respectively. The eigenvalues/vectors which were not -* deflated go into the first K slots of DLAMDA and Q2 respectively, +* deflated go into the first K slots of DLAMBDA and Q2 respectively, * while those which were deflated go into the last N - K slots. * I = 1 diff --git a/lapack-netlib/SRC/slaed3.f b/lapack-netlib/SRC/slaed3.f index e84f22be1..44c601f91 100644 --- a/lapack-netlib/SRC/slaed3.f +++ b/lapack-netlib/SRC/slaed3.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE SLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMDA, Q2, INDX, +* SUBROUTINE SLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMBDA, Q2, INDX, * CTOT, W, S, INFO ) * * .. Scalar Arguments .. @@ -27,7 +27,7 @@ * .. * .. Array Arguments .. * INTEGER CTOT( * ), INDX( * ) -* REAL D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), +* REAL D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), * $ S( * ), W( * ) * .. * @@ -44,12 +44,6 @@ *> being combined by the matrix of eigenvectors of the K-by-K system *> which is solved here. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -104,14 +98,12 @@ *> RHO >= 0 required. *> \endverbatim *> -*> \param[in,out] DLAMDA +*> \param[in] DLAMBDA *> \verbatim -*> DLAMDA is REAL array, dimension (K) +*> DLAMBDA is REAL array, dimension (K) *> The first K elements of this array contain the old roots *> of the deflated updating problem. These are the poles -*> of the secular equation. May be changed on output by -*> having lowest order bit set to zero on Cray X-MP, Cray Y-MP, -*> Cray-2, or Cray C-90, as described above. +*> of the secular equation. *> \endverbatim *> *> \param[in] Q2 @@ -180,7 +172,7 @@ *> Modified by Francoise Tisseur, University of Tennessee *> * ===================================================================== - SUBROUTINE SLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMDA, Q2, INDX, + SUBROUTINE SLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMBDA, Q2, INDX, $ CTOT, W, S, INFO ) * * -- LAPACK computational routine -- @@ -193,7 +185,7 @@ * .. * .. Array Arguments .. INTEGER CTOT( * ), INDX( * ) - REAL D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), + REAL D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), $ S( * ), W( * ) * .. * @@ -208,8 +200,8 @@ REAL TEMP * .. * .. External Functions .. - REAL SLAMC3, SNRM2 - EXTERNAL SLAMC3, SNRM2 + REAL SNRM2 + EXTERNAL SNRM2 * .. * .. External Subroutines .. EXTERNAL SCOPY, SGEMM, SLACPY, SLAED4, SLASET, XERBLA @@ -239,30 +231,9 @@ * IF( K.EQ.0 ) $ RETURN -* -* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), -* which on any of these machines zeros out the bottommost -* bit of DLAMDA(I) if it is 1; this makes the subsequent -* subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DLAMDA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DLAMDA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DLAMBDA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 10 I = 1, K - DLAMDA( I ) = SLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I ) - 10 CONTINUE * DO 20 J = 1, K - CALL SLAED4( K, J, DLAMDA, W, Q( 1, J ), RHO, D( J ), INFO ) + CALL SLAED4( K, J, DLAMBDA, W, Q( 1, J ), RHO, D( J ), INFO ) * * If the zero finder fails, the computation is terminated. * @@ -293,10 +264,10 @@ CALL SCOPY( K, Q, LDQ+1, W, 1 ) DO 60 J = 1, K DO 40 I = 1, J - 1 - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 40 CONTINUE DO 50 I = J + 1, K - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 50 CONTINUE 60 CONTINUE DO 70 I = 1, K diff --git a/lapack-netlib/SRC/slaed8.f b/lapack-netlib/SRC/slaed8.f index 9c8ba440c..9dd8a15f6 100644 --- a/lapack-netlib/SRC/slaed8.f +++ b/lapack-netlib/SRC/slaed8.f @@ -19,7 +19,7 @@ * =========== * * SUBROUTINE SLAED8( ICOMPQ, K, N, QSIZ, D, Q, LDQ, INDXQ, RHO, -* CUTPNT, Z, DLAMDA, Q2, LDQ2, W, PERM, GIVPTR, +* CUTPNT, Z, DLAMBDA, Q2, LDQ2, W, PERM, GIVPTR, * GIVCOL, GIVNUM, INDXP, INDX, INFO ) * * .. Scalar Arguments .. @@ -30,7 +30,7 @@ * .. Array Arguments .. * INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), * $ INDXQ( * ), PERM( * ) -* REAL D( * ), DLAMDA( * ), GIVNUM( 2, * ), +* REAL D( * ), DLAMBDA( * ), GIVNUM( 2, * ), * $ Q( LDQ, * ), Q2( LDQ2, * ), W( * ), Z( * ) * .. * @@ -141,9 +141,9 @@ *> process. *> \endverbatim *> -*> \param[out] DLAMDA +*> \param[out] DLAMBDA *> \verbatim -*> DLAMDA is REAL array, dimension (N) +*> DLAMBDA is REAL array, dimension (N) *> A copy of the first K eigenvalues which will be used by *> SLAED3 to form the secular equation. *> \endverbatim @@ -238,7 +238,7 @@ * * ===================================================================== SUBROUTINE SLAED8( ICOMPQ, K, N, QSIZ, D, Q, LDQ, INDXQ, RHO, - $ CUTPNT, Z, DLAMDA, Q2, LDQ2, W, PERM, GIVPTR, + $ CUTPNT, Z, DLAMBDA, Q2, LDQ2, W, PERM, GIVPTR, $ GIVCOL, GIVNUM, INDXP, INDX, INFO ) * * -- LAPACK computational routine -- @@ -253,7 +253,7 @@ * .. Array Arguments .. INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), $ INDXQ( * ), PERM( * ) - REAL D( * ), DLAMDA( * ), GIVNUM( 2, * ), + REAL D( * ), DLAMBDA( * ), GIVNUM( 2, * ), $ Q( LDQ, * ), Q2( LDQ2, * ), W( * ), Z( * ) * .. * @@ -339,14 +339,14 @@ INDXQ( I ) = INDXQ( I ) + CUTPNT 20 CONTINUE DO 30 I = 1, N - DLAMDA( I ) = D( INDXQ( I ) ) + DLAMBDA( I ) = D( INDXQ( I ) ) W( I ) = Z( INDXQ( I ) ) 30 CONTINUE I = 1 J = CUTPNT + 1 - CALL SLAMRG( N1, N2, DLAMDA, 1, 1, INDX ) + CALL SLAMRG( N1, N2, DLAMBDA, 1, 1, INDX ) DO 40 I = 1, N - D( I ) = DLAMDA( INDX( I ) ) + D( I ) = DLAMBDA( INDX( I ) ) Z( I ) = W( INDX( I ) ) 40 CONTINUE * @@ -464,7 +464,7 @@ ELSE K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM JLAM = J END IF @@ -476,26 +476,26 @@ * K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM * 110 CONTINUE * -* Sort the eigenvalues and corresponding eigenvectors into DLAMDA +* Sort the eigenvalues and corresponding eigenvectors into DLAMBDA * and Q2 respectively. The eigenvalues/vectors which were not -* deflated go into the first K slots of DLAMDA and Q2 respectively, +* deflated go into the first K slots of DLAMBDA and Q2 respectively, * while those which were deflated go into the last N - K slots. * IF( ICOMPQ.EQ.0 ) THEN DO 120 J = 1, N JP = INDXP( J ) - DLAMDA( J ) = D( JP ) + DLAMBDA( J ) = D( JP ) PERM( J ) = INDXQ( INDX( JP ) ) 120 CONTINUE ELSE DO 130 J = 1, N JP = INDXP( J ) - DLAMDA( J ) = D( JP ) + DLAMBDA( J ) = D( JP ) PERM( J ) = INDXQ( INDX( JP ) ) CALL SCOPY( QSIZ, Q( 1, PERM( J ) ), 1, Q2( 1, J ), 1 ) 130 CONTINUE @@ -506,9 +506,9 @@ * IF( K.LT.N ) THEN IF( ICOMPQ.EQ.0 ) THEN - CALL SCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 ) + CALL SCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 ) ELSE - CALL SCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 ) + CALL SCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 ) CALL SLACPY( 'A', QSIZ, N-K, Q2( 1, K+1 ), LDQ2, $ Q( 1, K+1 ), LDQ ) END IF diff --git a/lapack-netlib/SRC/slaed9.f b/lapack-netlib/SRC/slaed9.f index 4d07416e9..d1b7b29fd 100644 --- a/lapack-netlib/SRC/slaed9.f +++ b/lapack-netlib/SRC/slaed9.f @@ -18,15 +18,15 @@ * Definition: * =========== * -* SUBROUTINE SLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMDA, W, -* S, LDS, INFO ) +* SUBROUTINE SLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMBDA, +* W, S, LDS, INFO ) * * .. Scalar Arguments .. * INTEGER INFO, K, KSTART, KSTOP, LDQ, LDS, N * REAL RHO * .. * .. Array Arguments .. -* REAL D( * ), DLAMDA( * ), Q( LDQ, * ), S( LDS, * ), +* REAL D( * ), DLAMBDA( * ), Q( LDQ, * ), S( LDS, * ), * $ W( * ) * .. * @@ -96,9 +96,9 @@ *> RHO >= 0 required. *> \endverbatim *> -*> \param[in] DLAMDA +*> \param[in] DLAMBDA *> \verbatim -*> DLAMDA is REAL array, dimension (K) +*> DLAMBDA is REAL array, dimension (K) *> The first K elements of this array contain the old roots *> of the deflated updating problem. These are the poles *> of the secular equation. @@ -151,8 +151,8 @@ *> at Berkeley, USA * * ===================================================================== - SUBROUTINE SLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMDA, W, - $ S, LDS, INFO ) + SUBROUTINE SLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMBDA, + $ W, S, LDS, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -163,7 +163,7 @@ REAL RHO * .. * .. Array Arguments .. - REAL D( * ), DLAMDA( * ), Q( LDQ, * ), S( LDS, * ), + REAL D( * ), DLAMBDA( * ), Q( LDQ, * ), S( LDS, * ), $ W( * ) * .. * @@ -174,8 +174,8 @@ REAL TEMP * .. * .. External Functions .. - REAL SLAMC3, SNRM2 - EXTERNAL SLAMC3, SNRM2 + REAL SNRM2 + EXTERNAL SNRM2 * .. * .. External Subroutines .. EXTERNAL SCOPY, SLAED4, XERBLA @@ -212,30 +212,9 @@ * IF( K.EQ.0 ) $ RETURN -* -* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), -* which on any of these machines zeros out the bottommost -* bit of DLAMDA(I) if it is 1; this makes the subsequent -* subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DLAMDA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DLAMDA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DLAMBDA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 10 I = 1, N - DLAMDA( I ) = SLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I ) - 10 CONTINUE * DO 20 J = KSTART, KSTOP - CALL SLAED4( K, J, DLAMDA, W, Q( 1, J ), RHO, D( J ), INFO ) + CALL SLAED4( K, J, DLAMBDA, W, Q( 1, J ), RHO, D( J ), INFO ) * * If the zero finder fails, the computation is terminated. * @@ -261,10 +240,10 @@ CALL SCOPY( K, Q, LDQ+1, W, 1 ) DO 70 J = 1, K DO 50 I = 1, J - 1 - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 50 CONTINUE DO 60 I = J + 1, K - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 60 CONTINUE 70 CONTINUE DO 80 I = 1, K diff --git a/lapack-netlib/SRC/slals0.f b/lapack-netlib/SRC/slals0.f index 7d44e2864..f168dc653 100644 --- a/lapack-netlib/SRC/slals0.f +++ b/lapack-netlib/SRC/slals0.f @@ -389,6 +389,11 @@ $ ( POLES( I, 2 ).EQ.ZERO ) ) THEN WORK( I ) = ZERO ELSE +* +* Use calls to the subroutine SLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent +* optimizing compilers from doing x+(y+z). +* WORK( I ) = POLES( I, 2 )*Z( I ) / $ ( SLAMC3( POLES( I, 2 ), DSIGJ )- $ DIFLJ ) / ( POLES( I, 2 )+DJ ) @@ -440,6 +445,11 @@ IF( Z( J ).EQ.ZERO ) THEN WORK( I ) = ZERO ELSE +* +* Use calls to the subroutine SLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent +* optimizing compilers from doing x+(y+z). +* WORK( I ) = Z( J ) / ( SLAMC3( DSIGJ, -POLES( I+1, $ 2 ) )-DIFR( I, 1 ) ) / $ ( DSIGJ+POLES( I, 1 ) ) / DIFR( I, 2 ) diff --git a/lapack-netlib/SRC/slalsa.f b/lapack-netlib/SRC/slalsa.f index 465455e4a..95becc76e 100644 --- a/lapack-netlib/SRC/slalsa.f +++ b/lapack-netlib/SRC/slalsa.f @@ -43,9 +43,9 @@ *> *> \verbatim *> -*> SLALSA is an itermediate step in solving the least squares problem +*> SLALSA is an intermediate step in solving the least squares problem *> by computing the SVD of the coefficient matrix in compact form (The -*> singular vectors are computed as products of simple orthorgonal +*> singular vectors are computed as products of simple orthogonal *> matrices.). *> *> If ICOMPQ = 0, SLALSA applies the inverse of the left singular vector diff --git a/lapack-netlib/SRC/slalsd.f b/lapack-netlib/SRC/slalsd.f index 2197f728e..9943a52d9 100644 --- a/lapack-netlib/SRC/slalsd.f +++ b/lapack-netlib/SRC/slalsd.f @@ -47,12 +47,6 @@ *> problem; in this case a minimum norm solution is returned. *> The actual singular values are returned in D in ascending order. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/slamswlq.f b/lapack-netlib/SRC/slamswlq.f index d4996b1f2..432afaded 100644 --- a/lapack-netlib/SRC/slamswlq.f +++ b/lapack-netlib/SRC/slamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -189,33 +192,38 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamswlq +*> * ===================================================================== SUBROUTINE SLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + REAL A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL STPMLQT, SGEMLQT, XERBLA * .. @@ -223,52 +231,60 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * INFO = 0 IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLAMSWLQ', -INFO ) - WORK(1) = LW RETURN - ELSE IF (LQUERY) THEN - WORK(1) = LW + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -402,7 +418,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLAMSWLQ diff --git a/lapack-netlib/SRC/slamtsqr.f b/lapack-netlib/SRC/slamtsqr.f index 960b794de..f9b167aea 100644 --- a/lapack-netlib/SRC/slamtsqr.f +++ b/lapack-netlib/SRC/slamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) -*> +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). *> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -191,33 +193,38 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamtsqr +*> * ===================================================================== SUBROUTINE SLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + REAL A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL SGEMQRT, STPMQRT, XERBLA * .. @@ -225,12 +232,13 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -238,11 +246,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -253,38 +267,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN. AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = LW + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL SGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -410,7 +424,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLAMTSQR diff --git a/lapack-netlib/SRC/slaqp2rk.c b/lapack-netlib/SRC/slaqp2rk.c new file mode 100644 index 000000000..0bfa71ab9 --- /dev/null +++ b/lapack-netlib/SRC/slaqp2rk.c @@ -0,0 +1,918 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + *info = *n + kk - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL >= ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL >= ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + +/* Set K, the number of factorized columns. */ + + *k = kk - 1; + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. */ + + i__2 = minmnfact; + for (j = kk; j <= i__2; ++j) { + tau[j] = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,KK:N): */ +/* 1) swap the KK-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) copy the KK-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than KK in the next loop step.) */ +/* 3) Save the pivot interchange with the indices relative to the */ +/* the original matrix A, not the block A(1:M,1:N). */ + + if (kp != kk) { + sswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1); + vn1[kp] = vn1[kk]; + vn2[kp] = vn2[kk]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[kk]; + jpiv[kk] = itemp; + } + +/* Generate elementary reflector H(KK) using the column A(I:M,KK), */ +/* if the column has more than one element, otherwise */ +/* the elementary reflector would be an identity matrix, */ +/* and TAU(KK) = ZERO. */ + + if (i__ < *m) { + i__2 = *m - i__ + 1; + slarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], & + c__1, &tau[kk]); + } else { + tau[kk] = 0.f; + } + +/* Check if TAU(KK) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(KK) for Inf, */ +/* since SLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by SLARFG can contain Inf, which requires */ +/* TAU(KK) to contain NaN. Therefore, this case of generating Inf */ +/* by SLARFG is covered by checking TAU(KK) for NaN. */ + + if (sisnan_(&tau[kk])) { + *k = kk - 1; + *info = kk; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = tau[kk]; + *relmaxc2nrmk = tau[kk]; + +/* Array TAU(KK:MINMNFACT) is not set and contains */ +/* undefined elements, except the first element TAU(KK) = NaN. */ + + return 0; + } + +/* Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left. */ +/* ( If M >= N, then at KK = N there is no residual matrix, */ +/* i.e. no columns of A to update, only columns of B. */ +/* If M < N, then at KK = M-IOFFSET, I = M and we have a */ +/* one-row residual matrix in A and the elementary */ +/* reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update */ +/* is needed for the residual matrix in A and the */ +/* right-hand-side-matrix in B. */ +/* Therefore, we update only if */ +/* KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */ +/* condition is satisfied, not only KK < N+NRHS ) */ + + if (kk < minmnupdt) { + aikk = a[i__ + kk * a_dim1]; + a[i__ + kk * a_dim1] = 1.f; + i__2 = *m - i__ + 1; + i__3 = *n + *nrhs - kk; + slarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &tau[ + kk], &a[i__ + (kk + 1) * a_dim1], lda, &work[1]); + a[i__ + kk * a_dim1] = aikk; + } + + if (kk < minmnfact) { + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */ +/* when KK < f2cmin(M-IOFFSET, N). */ + + i__2 = *n; + for (j = kk + 1; j <= i__2; ++j) { + if (vn1[j] != 0.f) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + +/* Computing 2nd power */ + r__2 = (r__1 = a[i__ + j * a_dim1], abs(r__1)) / vn1[j]; + temp = 1.f - r__2 * r__2; + temp = f2cmax(temp,0.f); +/* Computing 2nd power */ + r__1 = vn1[j] / vn2[j]; + temp2 = temp * (r__1 * r__1); + if (temp2 <= tol3z) { + +/* Compute the column 2-norm for the partial */ +/* column A(I+1:M,J) by explicitly computing it, */ +/* and store it in both partial 2-norm vector VN1 */ +/* and exact column 2-norm vector VN2. */ + + i__3 = *m - i__; + vn1[j] = snrm2_(&i__3, &a[i__ + 1 + j * a_dim1], & + c__1); + vn2[j] = vn1[j]; + + } else { + +/* Update the column 2-norm for the partial */ +/* column A(I+1:M,J) by removing one */ +/* element A(I,J) and store it in partial */ +/* 2-norm vector VN1. */ + + vn1[j] *= sqrt(temp); + + } + } + } + + } + +/* End factorization loop */ + + } + +/* If we reached this point, all colunms have been factorized, */ +/* i.e. no condition was triggered to exit the routine. */ +/* Set the number of factorized columns. */ + + *k = *kmax; + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */ +/* we return. */ + + if (*k < minmnfact) { + + i__1 = *n - *k; + jmaxc2nrm = *k + isamax_(&i__1, &vn1[*k + 1], &c__1); + *maxc2nrmk = vn1[jmaxc2nrm]; + + if (*k == 0) { + *relmaxc2nrmk = 1.f; + } else { + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + } + + } else { + *maxc2nrmk = 0.f; + *relmaxc2nrmk = 0.f; + } + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, set TAUs corresponding to the columns that were */ +/* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO. */ + + i__1 = minmnfact; + for (j = *k + 1; j <= i__1; ++j) { + tau[j] = 0.f; + } + + return 0; + +/* End of SLAQP2RK */ + +} /* slaqp2rk_ */ + diff --git a/lapack-netlib/SRC/slaqp2rk.f b/lapack-netlib/SRC/slaqp2rk.f new file mode 100644 index 000000000..f88b0ce90 --- /dev/null +++ b/lapack-netlib/SRC/slaqp2rk.f @@ -0,0 +1,713 @@ +*> \brief \b SLAQP2RK computes truncated QR factorization with column pivoting of a real matrix block using Level 2 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SLAQP2RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, +* $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, +* $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, +* $ INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS +* REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER JPIV( * ) +* REAL A( LDA, * ), TAU( * ), VN1( * ), VN2( * ), +* $ WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLAQP2RK computes a truncated (rank K) or full rank Householder QR +*> factorization with column pivoting of a real matrix +*> block A(IOFFSET+1:M,1:N) as +*> +*> A * P(K) = Q(K) * R(K). +*> +*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N) +*> is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides matrix block B +*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**T * B. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M-IOFFSET,N), then this stopping +*> criterion is not used, factorize columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The second factorization stopping criterion. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The third factorization stopping criterion. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine SGEQP3RK. 1 <= KP1 <= N_orig_mat. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is DOUBLE PRECISION +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine SGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:K) below +*> the diagonal together with the array TAU represent +*> the orthogonal matrix Q(K) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:K) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS). +*> The left part A(IOFFSET+1:M,K+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(K)**T. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is REAL array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is REAL array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is REAL array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (N-1) +*> Used in SLARF subroutine to apply an elementary +*> reflector from the left. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp2rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, + $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, + $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, + $ INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS + REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. + INTEGER JPIV( * ) + REAL A( LDA, * ), TAU( * ), VN1( * ), VN2( * ), + $ WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + INTEGER I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT, + $ MINMNUPDT + REAL AIKK, HUGEVAL, TEMP, TEMP2, TOL3Z +* .. +* .. External Subroutines .. + EXTERNAL SLARF, SLARFG, SSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL SISNAN + INTEGER ISAMAX + REAL SLAMCH, SNRM2 + EXTERNAL SISNAN, SLAMCH, ISAMAX, SNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* +* MINMNUPDT is the smallest dimension +* of the subarray A(IOFFSET+1:M,1:N+NRHS) to be udated, which +* contains the submatrices A(IOFFSET+1:M,1:N) and +* B(IOFFSET+1:M,1:NRHS) as column blocks. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + KMAX = MIN( KMAX, MINMNFACT ) + TOL3Z = SQRT( SLAMCH( 'Epsilon' ) ) + HUGEVAL = SLAMCH( 'Overflow' ) +* +* Compute the factorization, KK is the lomn loop index. +* + DO KK = 1, KMAX +* + I = IOFFSET + KK +* + IF( I.EQ.1 ) THEN +* +* ============================================================ +* +* We are at the first column of the original whole matrix A, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + + KP = KP1 +* +* ============================================================ +* + ELSE +* +* ============================================================ +* +* Determine the pivot column in KK-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). +* + KP = ( KK-1 ) + ISAMAX( N-KK+1, VN1( KK ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,KK:N) in step KK. +* RELMAXC2NRMK will be computed later, after somecondition +* checks on MAXC2NRMK. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains NaN, and set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( SISNAN( MAXC2NRMK ) ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + INFO = K + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* Array TAU(K+1:MINMNFACT) is not set and contains +* undefined elements. +* + RETURN + END IF +* +* ============================================================ +* +* Quick return, if the submatrix A(I:M,KK:N) is +* a zero matrix. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + RELMAXC2NRMK = ZERO +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + KK - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third stopping criteria. +* NOTE: There is no need to test for ABSTOL >= ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL >= ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. + + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* +* Set K, the number of factorized columns. +* + K = KK - 1 +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,KK:N): +* 1) swap the KK-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) copy the KK-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than KK in the next loop step.) +* 3) Save the pivot interchange with the indices relative to the +* the original matrix A, not the block A(1:M,1:N). +* + IF( KP.NE.KK ) THEN + CALL SSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 ) + VN1( KP ) = VN1( KK ) + VN2( KP ) = VN2( KK ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( KK ) + JPIV( KK ) = ITEMP + END IF +* +* Generate elementary reflector H(KK) using the column A(I:M,KK), +* if the column has more than one element, otherwise +* the elementary reflector would be an identity matrix, +* and TAU(KK) = ZERO. +* + IF( I.LT.M ) THEN + CALL SLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1, + $ TAU( KK ) ) + ELSE + TAU( KK ) = ZERO + END IF +* +* Check if TAU(KK) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(KK) for Inf, +* since SLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by SLARFG can contain Inf, which requires +* TAU(KK) to contain NaN. Therefore, this case of generating Inf +* by SLARFG is covered by checking TAU(KK) for NaN. +* + IF( SISNAN( TAU(KK) ) ) THEN + K = KK - 1 + INFO = KK +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAU( KK ) + RELMAXC2NRMK = TAU( KK ) +* +* Array TAU(KK:MINMNFACT) is not set and contains +* undefined elements, except the first element TAU(KK) = NaN. +* + RETURN + END IF +* +* Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left. +* ( If M >= N, then at KK = N there is no residual matrix, +* i.e. no columns of A to update, only columns of B. +* If M < N, then at KK = M-IOFFSET, I = M and we have a +* one-row residual matrix in A and the elementary +* reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update +* is needed for the residual matrix in A and the +* right-hand-side-matrix in B. +* Therefore, we update only if +* KK < MINMNUPDT = min(M-IOFFSET, N+NRHS) +* condition is satisfied, not only KK < N+NRHS ) +* + IF( KK.LT.MINMNUPDT ) THEN + AIKK = A( I, KK ) + A( I, KK ) = ONE + CALL SLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1, + $ TAU( KK ), A( I, KK+1 ), LDA, WORK( 1 ) ) + A( I, KK ) = AIKK + END IF +* + IF( KK.LT.MINMNFACT ) THEN +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. +* when KK < min(M-IOFFSET, N). +* + DO J = KK + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2 + TEMP = MAX( TEMP, ZERO ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2 .LE. TOL3Z ) THEN +* +* Compute the column 2-norm for the partial +* column A(I+1:M,J) by explicitly computing it, +* and store it in both partial 2-norm vector VN1 +* and exact column 2-norm vector VN2. +* + VN1( J ) = SNRM2( M-I, A( I+1, J ), 1 ) + VN2( J ) = VN1( J ) +* + ELSE +* +* Update the column 2-norm for the partial +* column A(I+1:M,J) by removing one +* element A(I,J) and store it in partial +* 2-norm vector VN1. +* + VN1( J ) = VN1( J )*SQRT( TEMP ) +* + END IF + END IF + END DO +* + END IF +* +* End factorization loop +* + END DO +* +* If we reached this point, all colunms have been factorized, +* i.e. no condition was triggered to exit the routine. +* Set the number of factorized columns. +* + K = KMAX +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before +* we return. +* + IF( K.LT.MINMNFACT ) THEN +* + JMAXC2NRM = K + ISAMAX( N-K, VN1( K+1 ), 1 ) + MAXC2NRMK = VN1( JMAXC2NRM ) +* + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + END IF +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, set TAUs corresponding to the columns that were +* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO. +* + DO J = K + 1, MINMNFACT + TAU( J ) = ZERO + END DO +* + RETURN +* +* End of SLAQP2RK +* + END diff --git a/lapack-netlib/SRC/slaqp3rk.c b/lapack-netlib/SRC/slaqp3rk.c new file mode 100644 index 000000000..e3632538b --- /dev/null +++ b/lapack-netlib/SRC/slaqp3rk.c @@ -0,0 +1,1109 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + sgemm_("No transpose", "Transpose", &i__1, nrhs, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* Quick return, if the submatrix A(I:M,K:N) is */ +/* a zero matrix. We need to check it only if the column index */ +/* (same as row index) is larger than 1, since the condition */ +/* for the whole original matrix A_orig is checked in the main */ +/* routine. */ + + if (*maxc2nrmk == 0.f) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *relmaxc2nrmk = 0.f; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix is zero and we stop the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + sgemm_("No transpose", "Transpose", &i__1, nrhs, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + tau[j] = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* Check if the submatrix A(I:M,K:N) contains Inf, */ +/* set INFO parameter to the column number, where */ +/* the first Inf is found plus N, and continue */ +/* the computation. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + if (*info == 0 && *maxc2nrmk > myhugeval) { + *info = *n + k - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third tolerance stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL.GE.ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig; */ + + *kb = k - 1; + if__ = i__ - 1; + +/* Apply the block reflector to the residual of the */ +/* matrix A and the residual of the right hand sides B, if */ +/* the residual matrix and and/or the residual of the right */ +/* hand sides exist, i.e. if the submatrix */ +/* A(I+1:M,KB+1:N+NRHS) exists. This occurs when */ +/* KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + sgemm_("No transpose", "Transpose", &i__1, &i__2, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*kb + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + tau[j] = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,K:N): */ +/* 1) swap the K-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */ +/* 3) copy the K-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. (Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than K in the next loop step.) */ +/* 4) Save the pivot interchange with the indices relative to the */ +/* the original matrix A_orig, not the block A(1:M,1:N). */ + + if (kp != k) { + sswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + sswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf); + vn1[kp] = vn1[k]; + vn2[kp] = vn2[k]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[k]; + jpiv[k] = itemp; + } + +/* Apply previous Householder reflectors to column K: */ +/* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T. */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + sgemv_("No transpose", &i__1, &i__2, &c_b7, &a[i__ + a_dim1], lda, + &f[k + f_dim1], ldf, &c_b8, &a[i__ + k * a_dim1], &c__1); + } + +/* Generate elementary reflector H(k) using the column A(I:M,K). */ + + if (i__ < *m) { + i__1 = *m - i__ + 1; + slarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], & + c__1, &tau[k]); + } else { + tau[k] = 0.f; + } + +/* Check if TAU(K) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(K) for Inf, */ +/* since SLARFG cannot produce TAU(K) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by SLARFG can contain Inf, which requires */ +/* TAU(K) to contain NaN. Therefore, this case of generating Inf */ +/* by SLARFG is covered by checking TAU(K) for NaN. */ + + if (sisnan_(&tau[k])) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *info = k; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = tau[k]; + *relmaxc2nrmk = tau[k]; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix contains NaN and we stop */ +/* the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + sgemm_("No transpose", "Transpose", &i__1, nrhs, kb, &c_b7, & + a[if__ + 1 + a_dim1], lda, &f[*n + 1 + f_dim1], ldf, & + c_b8, &a[if__ + 1 + (*n + 1) * a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* =============================================================== */ + + aik = a[i__ + k * a_dim1]; + a[i__ + k * a_dim1] = 1.f; + +/* =============================================================== */ + +/* Compute the current K-th column of F: */ +/* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K). */ + + if (k < *n + *nrhs) { + i__1 = *m - i__ + 1; + i__2 = *n + *nrhs - k; + sgemv_("Transpose", &i__1, &i__2, &tau[k], &a[i__ + (k + 1) * + a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b30, &f[k + + 1 + k * f_dim1], &c__1); + } + +/* 2) Zero out elements above and on the diagonal of the */ +/* column K in matrix F, i.e elements F(1:K,K). */ + + i__1 = k; + for (j = 1; j <= i__1; ++j) { + f[j + k * f_dim1] = 0.f; + } + +/* 3) Incremental updating of the K-th column of F: */ +/* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T */ +/* * A(I:M,K). */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + r__1 = -tau[k]; + sgemv_("Transpose", &i__1, &i__2, &r__1, &a[i__ + a_dim1], lda, & + a[i__ + k * a_dim1], &c__1, &c_b30, &auxv[1], &c__1); + + i__1 = *n + *nrhs; + i__2 = k - 1; + sgemv_("No transpose", &i__1, &i__2, &c_b8, &f[f_dim1 + 1], ldf, & + auxv[1], &c__1, &c_b8, &f[k * f_dim1 + 1], &c__1); + } + +/* =============================================================== */ + +/* Update the current I-th row of A: */ +/* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */ +/* - A(I,1:K)*F(K+1:N+NRHS,1:K)**T. */ + + if (k < *n + *nrhs) { + i__1 = *n + *nrhs - k; + sgemv_("No transpose", &i__1, &k, &c_b7, &f[k + 1 + f_dim1], ldf, + &a[i__ + a_dim1], lda, &c_b8, &a[i__ + (k + 1) * a_dim1], + lda); + } + + a[i__ + k * a_dim1] = aik; + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */ +/* when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */ + + if (k < minmnfact) { + + i__1 = *n; + for (j = k + 1; j <= i__1; ++j) { + if (vn1[j] != 0.f) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + + temp = (r__1 = a[i__ + j * a_dim1], abs(r__1)) / vn1[j]; +/* Computing MAX */ + r__1 = 0.f, r__2 = (temp + 1.f) * (1.f - temp); + temp = f2cmax(r__1,r__2); +/* Computing 2nd power */ + r__1 = vn1[j] / vn2[j]; + temp2 = temp * (r__1 * r__1); + if (temp2 <= tol3z) { + +/* At J-index, we have a difficult column for the */ +/* update of the 2-norm. Save the index of the previous */ +/* difficult column in IWORK(J-1). */ +/* NOTE: ILSTCC > 1, threfore we can use IWORK only */ +/* with N-1 elements, where the elements are */ +/* shifted by 1 to the left. */ + + iwork[j - 1] = lsticc; + +/* Set the index of the last difficult column LSTICC. */ + + lsticc = j; + + } else { + vn1[j] *= sqrt(temp); + } + } + } + + } + +/* End of while loop. */ + + } + +/* Now, afler the loop: */ +/* Set KB, the number of factorized columns in the block; */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig, IF = IOFFSET + KB. */ + + *kb = k; + if__ = i__; + +/* Apply the block reflector to the residual of the matrix A */ +/* and the residual of the right hand sides B, if the residual */ +/* matrix and and/or the residual of the right hand sides */ +/* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */ +/* This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + sgemm_("No transpose", "Transpose", &i__1, &i__2, kb, &c_b7, &a[if__ + + 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b8, &a[if__ + + 1 + (*kb + 1) * a_dim1], lda); + } + +/* Recompute the 2-norm of the difficult columns. */ +/* Loop over the index of the difficult columns from the largest */ +/* to the smallest index. */ + + while(lsticc > 0) { + +/* LSTICC is the index of the last difficult column is greater */ +/* than 1. */ +/* ITEMP is the index of the previous difficult column. */ + + itemp = iwork[lsticc - 1]; + +/* Compute the 2-norm explicilty for the last difficult column and */ +/* save it in the partial and exact 2-norm vectors VN1 and VN2. */ + +/* NOTE: The computation of VN1( LSTICC ) relies on the fact that */ +/* SNRM2 does not fail on vectors with norm below the value of */ +/* SQRT(SLAMCH('S')) */ + + i__1 = *m - if__; + vn1[lsticc] = snrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1); + vn2[lsticc] = vn1[lsticc]; + +/* Downdate the index of the last difficult column to */ +/* the index of the previous difficult column. */ + + lsticc = itemp; + + } + + return 0; + +/* End of SLAQP3RK */ + +} /* slaqp3rk_ */ + diff --git a/lapack-netlib/SRC/slaqp3rk.f b/lapack-netlib/SRC/slaqp3rk.f new file mode 100644 index 000000000..b2dc2b334 --- /dev/null +++ b/lapack-netlib/SRC/slaqp3rk.f @@ -0,0 +1,935 @@ +*> \brief \b SLAQP3RK computes a step of truncated QR factorization with column pivoting of a real m-by-n matrix A using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SLAQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, +* $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, +* $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) +* IMPLICIT NONE +* LOGICAL DONE +* INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, +* $ NB, NRHS +* REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* +* .. Scalar Arguments .. +* LOGICAL DONE +* INTEGER KB, LDA, LDF, M, N, NB, NRHS, IOFFSET +* REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* REAL A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ), +* $ VN1( * ), VN2( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLAQP3RK computes a step of truncated QR factorization with column +*> pivoting of a real M-by-N matrix A block A(IOFFSET+1:M,1:N) +*> by using Level 3 BLAS as +*> +*> A * P(KB) = Q(KB) * R(KB). +*> +*> The routine tries to factorize NB columns from A starting from +*> the row IOFFSET+1 and updates the residual matrix with BLAS 3 +*> xGEMM. The number of actually factorized columns is returned +*> is smaller than NB. +*> +*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides B matrix stored +*> in A(IOFFSET+1:M,1:N+1:N+NRHS) with Q(KB)**T * B. +*> +*> Cases when the number of factorized columns KB < NB: +*> +*> (1) In some cases, due to catastrophic cancellations, it cannot +*> factorize all NB columns and need to update the residual matrix. +*> Hence, the actual number of factorized columns in the block returned +*> in KB is smaller than NB. The logical DONE is returned as FALSE. +*> The factorization of the whole original matrix A_orig must proceed +*> with the next block. +*> +*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied, +*> the factorization of the whole original matrix A_orig is stopped, +*> the logical DONE is returned as TRUE. The number of factorized +*> columns which is smaller than NB is returned in KB. +*> +*> (3) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix is a zero matrix in some factorization +*> step KB, the factorization of the whole original matrix A_orig is +*> stopped, the logical DONE is returned as TRUE. The number of +*> factorized columns which is smaller than NB is returned in KB. +*> +*> (4) Whenever NaN is detected in the matrix A or in the array TAU, +*> the factorization of the whole original matrix A_orig is stopped, +*> the logical DONE is returned as TRUE. The number of factorized +*> columns which is smaller than NB is returned in KB. The INFO +*> parameter is set to the column index of the first NaN occurrence. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0 +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> Factorization block size, i.e the number of columns +*> to factorize in the matrix A. 0 <= NB +*> +*> If NB = 0, then the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on NB and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is REAL, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on NB and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine SGEQP3RK. 1 <= KP1 <= N_orig. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is REAL +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine SGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:KB) below +*> the diagonal together with the array TAU represent +*> the orthogonal matrix Q(KB) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:KB) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS). +*> The left part A(IOFFSET+1:M,KB+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(KB)**T. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] +*> \verbatim +*> DONE is LOGICAL +*> TRUE: a) if the factorization completed before processing +*> all min(M-IOFFSET,NB,N) columns due to ABSTOL +*> or RELTOL criterion, +*> b) if the factorization completed before processing +*> all min(M-IOFFSET,NB,N) columns due to the +*> residual matrix being a ZERO matrix. +*> c) when NaN was detected in the matrix A +*> or in the array TAU. +*> FALSE: otherwise. +*> \endverbatim +*> +*> \param[out] KB +*> \verbatim +*> KB is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= KB <= min(M-IOFFSET,NB,N). +*> +*> KB also represents the number of non-zero Householder +*> vectors. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is REAL +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank KB. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is REAL +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank KB) to the maximum column 2-norm of the +*> original matrix A_orig. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is REAL array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is REAL array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is REAL array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] AUXV +*> \verbatim +*> AUXV is REAL array, dimension (NB) +*> Auxiliary vector. +*> \endverbatim +*> +*> \param[out] F +*> \verbatim +*> F is REAL array, dimension (LDF,NB) +*> Matrix F**T = L*(Y**T)*A. +*> \endverbatim +*> +*> \param[in] LDF +*> \verbatim +*> LDF is INTEGER +*> The leading dimension of the array F. LDF >= max(1,N+NRHS). +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step KB+1 ( when KB columns +*> have been factorized ). +*> +*> On exit: +*> KB is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(KB+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=KB+1, TAU(KB+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the actorization +*> step KB+1 ( when KB columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp3rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, + $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + LOGICAL DONE + INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, + $ NB, NRHS + REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + REAL A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ), + $ VN1( * ), VN2( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + INTEGER ITEMP, J, K, MINMNFACT, MINMNUPDT, + $ LSTICC, KP, I, IF + REAL AIK, HUGEVAL, TEMP, TEMP2, TOL3Z +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SGEMV, SLARFG, SSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL SISNAN + INTEGER ISAMAX + REAL SLAMCH, SNRM2 + EXTERNAL SISNAN, SLAMCH, ISAMAX, SNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + NB = MIN( NB, MINMNFACT ) + TOL3Z = SQRT( SLAMCH( 'Epsilon' ) ) + HUGEVAL = SLAMCH( 'Overflow' ) +* +* Compute factorization in a while loop over NB columns, +* K is the column index in the block A(1:M,1:N). +* + K = 0 + LSTICC = 0 + DONE = .FALSE. +* + DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 ) + K = K + 1 + I = IOFFSET + K +* + IF( I.EQ.1 ) THEN +* +* We are at the first column of the original whole matrix A_orig, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + KP = KP1 +* + ELSE +* +* Determine the pivot column in K-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). +* + KP = ( K-1 ) + ISAMAX( N-K+1, VN1( K ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,K:N) in step K. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains NaN, set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( SISNAN( MAXC2NRMK ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = KB + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. + + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL SGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* Quick return, if the submatrix A(I:M,K:N) is +* a zero matrix. We need to check it only if the column index +* (same as row index) is larger than 1, since the condition +* for the whole original matrix A_orig is checked in the main +* routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + RELMAXC2NRMK = ZERO +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix is zero and we stop the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL SGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. +* + DO J = K, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + K - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third tolerance stopping criteria. +* NOTE: There is no need to test for ABSTOL.GE.ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig; +* + KB = K - 1 + IF = I - 1 +* +* Apply the block reflector to the residual of the +* matrix A and the residual of the right hand sides B, if +* the residual matrix and and/or the residual of the right +* hand sides exist, i.e. if the submatrix +* A(I+1:M,KB+1:N+NRHS) exists. This occurs when +* KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL SGEMM( 'No transpose', 'Transpose', + $ M-IF, N+NRHS-KB, KB,-ONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. +* + DO J = K, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,K:N): +* 1) swap the K-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) +* 3) copy the K-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. (Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than K in the next loop step.) +* 4) Save the pivot interchange with the indices relative to the +* the original matrix A_orig, not the block A(1:M,1:N). +* + IF( KP.NE.K ) THEN + CALL SSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 ) + CALL SSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF ) + VN1( KP ) = VN1( K ) + VN2( KP ) = VN2( K ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( K ) + JPIV( K ) = ITEMP + END IF +* +* Apply previous Householder reflectors to column K: +* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T. +* + IF( K.GT.1 ) THEN + CALL SGEMV( 'No transpose', M-I+1, K-1, -ONE, A( I, 1 ), + $ LDA, F( K, 1 ), LDF, ONE, A( I, K ), 1 ) + END IF +* +* Generate elementary reflector H(k) using the column A(I:M,K). +* + IF( I.LT.M ) THEN + CALL SLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) ) + ELSE + TAU( K ) = ZERO + END IF +* +* Check if TAU(K) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(K) for Inf, +* since SLARFG cannot produce TAU(K) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by SLARFG can contain Inf, which requires +* TAU(K) to contain NaN. Therefore, this case of generating Inf +* by SLARFG is covered by checking TAU(K) for NaN. +* + IF( SISNAN( TAU(K) ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = K +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAU( K ) + RELMAXC2NRMK = TAU( K ) +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL SGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* =============================================================== +* + AIK = A( I, K ) + A( I, K ) = ONE +* +* =============================================================== +* +* Compute the current K-th column of F: +* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K). +* + IF( K.LT.N+NRHS ) THEN + CALL SGEMV( 'Transpose', M-I+1, N+NRHS-K, + $ TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1, + $ ZERO, F( K+1, K ), 1 ) + END IF +* +* 2) Zero out elements above and on the diagonal of the +* column K in matrix F, i.e elements F(1:K,K). +* + DO J = 1, K + F( J, K ) = ZERO + END DO +* +* 3) Incremental updating of the K-th column of F: +* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T +* * A(I:M,K). +* + IF( K.GT.1 ) THEN + CALL SGEMV( 'Transpose', M-I+1, K-1, -TAU( K ), + $ A( I, 1 ), LDA, A( I, K ), 1, ZERO, + $ AUXV( 1 ), 1 ) +* + CALL SGEMV( 'No transpose', N+NRHS, K-1, ONE, + $ F( 1, 1 ), LDF, AUXV( 1 ), 1, ONE, + $ F( 1, K ), 1 ) + END IF +* +* =============================================================== +* +* Update the current I-th row of A: +* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) +* - A(I,1:K)*F(K+1:N+NRHS,1:K)**T. +* + IF( K.LT.N+NRHS ) THEN + CALL SGEMV( 'No transpose', N+NRHS-K, K, -ONE, + $ F( K+1, 1 ), LDF, A( I, 1 ), LDA, ONE, + $ A( I, K+1 ), LDA ) + END IF +* + A( I, K ) = AIK +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. +* when K < MINMNFACT = min( M-IOFFSET, N ). +* + IF( K.LT.MINMNFACT ) THEN +* + DO J = K + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ABS( A( I, J ) ) / VN1( J ) + TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2.LE.TOL3Z ) THEN +* +* At J-index, we have a difficult column for the +* update of the 2-norm. Save the index of the previous +* difficult column in IWORK(J-1). +* NOTE: ILSTCC > 1, threfore we can use IWORK only +* with N-1 elements, where the elements are +* shifted by 1 to the left. +* + IWORK( J-1 ) = LSTICC +* +* Set the index of the last difficult column LSTICC. +* + LSTICC = J +* + ELSE + VN1( J ) = VN1( J )*SQRT( TEMP ) + END IF + END IF + END DO +* + END IF +* +* End of while loop. +* + END DO +* +* Now, afler the loop: +* Set KB, the number of factorized columns in the block; +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig, IF = IOFFSET + KB. +* + KB = K + IF = I +* +* Apply the block reflector to the residual of the matrix A +* and the residual of the right hand sides B, if the residual +* matrix and and/or the residual of the right hand sides +* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. +* This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL SGEMM( 'No transpose', 'Transpose', + $ M-IF, N+NRHS-KB, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* Recompute the 2-norm of the difficult columns. +* Loop over the index of the difficult columns from the largest +* to the smallest index. +* + DO WHILE( LSTICC.GT.0 ) +* +* LSTICC is the index of the last difficult column is greater +* than 1. +* ITEMP is the index of the previous difficult column. +* + ITEMP = IWORK( LSTICC-1 ) +* +* Compute the 2-norm explicilty for the last difficult column and +* save it in the partial and exact 2-norm vectors VN1 and VN2. +* +* NOTE: The computation of VN1( LSTICC ) relies on the fact that +* SNRM2 does not fail on vectors with norm below the value of +* SQRT(SLAMCH('S')) +* + VN1( LSTICC ) = SNRM2( M-IF, A( IF+1, LSTICC ), 1 ) + VN2( LSTICC ) = VN1( LSTICC ) +* +* Downdate the index of the last difficult column to +* the index of the previous difficult column. +* + LSTICC = ITEMP +* + END DO +* + RETURN +* +* End of SLAQP3RK +* + END diff --git a/lapack-netlib/SRC/slaqr2.f b/lapack-netlib/SRC/slaqr2.f index 62c4ef5eb..caf79fd1c 100644 --- a/lapack-netlib/SRC/slaqr2.f +++ b/lapack-netlib/SRC/slaqr2.f @@ -263,7 +263,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERauxiliary +*> \ingroup laqr2 * *> \par Contributors: * ================== @@ -305,11 +305,11 @@ LOGICAL BULGE, SORTED * .. * .. External Functions .. - REAL SLAMCH - EXTERNAL SLAMCH + REAL SLAMCH, SROUNDUP_LWORK + EXTERNAL SLAMCH, SROUNDUP_LWORK * .. * .. External Subroutines .. - EXTERNAL SCOPY, SGEHRD, SGEMM, SLABAD, SLACPY, SLAHQR, + EXTERNAL SCOPY, SGEHRD, SGEMM, SLACPY, SLAHQR, $ SLANV2, SLARF, SLARFG, SLASET, SORMHR, STREXC * .. * .. Intrinsic Functions .. @@ -343,7 +343,7 @@ * ==== Quick return in case of workspace query. ==== * IF( LWORK.EQ.-1 ) THEN - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -362,7 +362,6 @@ * SAFMIN = SLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE / SAFMIN - CALL SLABAD( SAFMIN, SAFMAX ) ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N ) / ULP ) * @@ -674,7 +673,7 @@ * * ==== Return optimal workspace. ==== * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * * ==== End of SLAQR2 ==== * diff --git a/lapack-netlib/SRC/slaqr3.f b/lapack-netlib/SRC/slaqr3.f index 519ccd6ed..d3ffb0f96 100644 --- a/lapack-netlib/SRC/slaqr3.f +++ b/lapack-netlib/SRC/slaqr3.f @@ -260,7 +260,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERauxiliary +*> \ingroup laqr3 * *> \par Contributors: * ================== @@ -302,14 +302,13 @@ LOGICAL BULGE, SORTED * .. * .. External Functions .. - REAL SLAMCH + REAL SLAMCH, SROUNDUP_LWORK INTEGER ILAENV - EXTERNAL SLAMCH, ILAENV + EXTERNAL SLAMCH, SROUNDUP_LWORK, ILAENV * .. * .. External Subroutines .. - EXTERNAL SCOPY, SGEHRD, SGEMM, SLABAD, SLACPY, SLAHQR, - $ SLANV2, SLAQR4, SLARF, SLARFG, SLASET, SORMHR, - $ STREXC + EXTERNAL SCOPY, SGEHRD, SGEMM, SLACPY, SLAHQR, SLANV2, + $ SLAQR4, SLARF, SLARFG, SLASET, SORMHR, STREXC * .. * .. Intrinsic Functions .. INTRINSIC ABS, INT, MAX, MIN, REAL, SQRT @@ -348,7 +347,7 @@ * ==== Quick return in case of workspace query. ==== * IF( LWORK.EQ.-1 ) THEN - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -367,7 +366,6 @@ * SAFMIN = SLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE / SAFMIN - CALL SLABAD( SAFMIN, SAFMAX ) ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N ) / ULP ) * @@ -685,7 +683,7 @@ * * ==== Return optimal workspace. ==== * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * * ==== End of SLAQR3 ==== * diff --git a/lapack-netlib/SRC/slaqr4.f b/lapack-netlib/SRC/slaqr4.f index 1f0a51c85..d6721df97 100644 --- a/lapack-netlib/SRC/slaqr4.f +++ b/lapack-netlib/SRC/slaqr4.f @@ -239,7 +239,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERauxiliary +*> \ingroup laqr4 * *> \par Contributors: * ================== @@ -316,7 +316,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Local Arrays .. REAL ZDUM( 1, 1 ) @@ -325,7 +326,7 @@ EXTERNAL SLACPY, SLAHQR, SLANV2, SLAQR2, SLAQR5 * .. * .. Intrinsic Functions .. - INTRINSIC ABS, INT, MAX, MIN, MOD, REAL + INTRINSIC ABS, INT, MAX, MIN, MOD * .. * .. Executable Statements .. INFO = 0 @@ -401,7 +402,7 @@ * ==== Quick return in case of workspace query. ==== * IF( LWORK.EQ.-1 ) THEN - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -732,7 +733,7 @@ * * ==== Return the optimal value of LWORK. ==== * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * * ==== End of SLAQR4 ==== * diff --git a/lapack-netlib/SRC/slaqz0.f b/lapack-netlib/SRC/slaqz0.f index 2e06f9d42..c128093e4 100644 --- a/lapack-netlib/SRC/slaqz0.f +++ b/lapack-netlib/SRC/slaqz0.f @@ -100,7 +100,7 @@ *> Anal., 29(2006), pp. 199--227. *> *> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift, -*> multipole rational QZ method with agressive early deflation" +*> multipole rational QZ method with aggressive early deflation" *> \endverbatim * * Arguments: @@ -294,7 +294,7 @@ * *> \date May 2020 * -*> \ingroup doubleGEcomputational +*> \ingroup laqz0 *> * ===================================================================== RECURSIVE SUBROUTINE SLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A, @@ -329,9 +329,9 @@ CHARACTER :: JBCMPZ*3 * External Functions - EXTERNAL :: XERBLA, SHGEQZ, SLAQZ3, SLAQZ4, SLASET, SLABAD, + EXTERNAL :: XERBLA, SHGEQZ, SLAQZ3, SLAQZ4, SLASET, $ SLARTG, SROT - REAL, EXTERNAL :: SLAMCH, SLANHS + REAL, EXTERNAL :: SLAMCH, SLANHS, SROUNDUP_LWORK LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -461,7 +461,7 @@ LWORKREQ = MAX( ITEMP1+2*NW**2, ITEMP2+2*NBR**2 ) IF ( LWORK .EQ.-1 ) THEN - WORK( 1 ) = REAL( LWORKREQ ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKREQ ) RETURN ELSE IF ( LWORK .LT. LWORKREQ ) THEN INFO = -19 @@ -479,7 +479,6 @@ * Get machine constants SAFMIN = SLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE/SAFMIN - CALL SLABAD( SAFMIN, SAFMAX ) ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N )/ULP ) @@ -564,7 +563,7 @@ DO WHILE ( K.GE.ISTART2 ) IF( ABS( B( K, K ) ) .LT. BTOL ) THEN -* A diagonal element of B is negligable, move it +* A diagonal element of B is negligible, move it * to the top and deflate it DO K2 = K, ISTART2+1, -1 diff --git a/lapack-netlib/SRC/slaqz3.f b/lapack-netlib/SRC/slaqz3.f index edb8a6012..979381364 100644 --- a/lapack-netlib/SRC/slaqz3.f +++ b/lapack-netlib/SRC/slaqz3.f @@ -228,7 +228,7 @@ * *> \date May 2020 * -*> \ingroup doubleGEcomputational +*> \ingroup laqz3 *> * ===================================================================== RECURSIVE SUBROUTINE SLAQZ3( ILSCHUR, ILQ, ILZ, N, ILO, IHI, NW, @@ -258,9 +258,9 @@ REAL :: S, SMLNUM, ULP, SAFMIN, SAFMAX, C1, S1, TEMP * External Functions - EXTERNAL :: XERBLA, STGEXC, SLABAD, SLAQZ0, SLACPY, SLASET, + EXTERNAL :: XERBLA, STGEXC, SLAQZ0, SLACPY, SLASET, $ SLAQZ2, SROT, SLARTG, SLAG2, SGEMM - REAL, EXTERNAL :: SLAMCH + REAL, EXTERNAL :: SLAMCH, SROUNDUP_LWORK INFO = 0 @@ -286,7 +286,7 @@ LWORKREQ = MAX( LWORKREQ, N*NW, 2*NW**2+N ) IF ( LWORK .EQ.-1 ) THEN * workspace query, quick return - WORK( 1 ) = LWORKREQ + WORK( 1 ) = SROUNDUP_LWORK(LWORKREQ) RETURN ELSE IF ( LWORK .LT. LWORKREQ ) THEN INFO = -26 @@ -300,7 +300,6 @@ * Get machine constants SAFMIN = SLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE/SAFMIN - CALL SLABAD( SAFMIN, SAFMAX ) ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N )/ULP ) diff --git a/lapack-netlib/SRC/slaqz4.f b/lapack-netlib/SRC/slaqz4.f index 3c307dd47..95b2784c5 100644 --- a/lapack-netlib/SRC/slaqz4.f +++ b/lapack-netlib/SRC/slaqz4.f @@ -204,7 +204,7 @@ * *> \date May 2020 * -*> \ingroup doubleGEcomputational +*> \ingroup laqz4 *> * ===================================================================== SUBROUTINE SLAQZ4( ILSCHUR, ILQ, ILZ, N, ILO, IHI, NSHIFTS, @@ -236,6 +236,7 @@ * External functions EXTERNAL :: XERBLA, SGEMM, SLAQZ1, SLAQZ2, SLASET, SLARTG, SROT, $ SLACPY + REAL, EXTERNAL :: SROUNDUP_LWORK INFO = 0 IF ( NBLOCK_DESIRED .LT. NSHIFTS+1 ) THEN @@ -243,7 +244,7 @@ END IF IF ( LWORK .EQ.-1 ) THEN * workspace query, quick return - WORK( 1 ) = N*NBLOCK_DESIRED + WORK( 1 ) = SROUNDUP_LWORK(N*NBLOCK_DESIRED) RETURN ELSE IF ( LWORK .LT. N*NBLOCK_DESIRED ) THEN INFO = -25 diff --git a/lapack-netlib/SRC/slarfb_gett.f b/lapack-netlib/SRC/slarfb_gett.f index 7719f2965..f1fdef790 100644 --- a/lapack-netlib/SRC/slarfb_gett.f +++ b/lapack-netlib/SRC/slarfb_gett.f @@ -451,7 +451,7 @@ IF( LNOTIDENT ) THEN * * col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2, -* V1 is not an identy matrix, but unit lower-triangular +* V1 is not an identity matrix, but unit lower-triangular * V1 stored in A1 (diagonal ones are not stored). * * diff --git a/lapack-netlib/SRC/slarfgp.f b/lapack-netlib/SRC/slarfgp.f index df42980c4..c28274c2c 100644 --- a/lapack-netlib/SRC/slarfgp.f +++ b/lapack-netlib/SRC/slarfgp.f @@ -97,7 +97,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERauxiliary +*> \ingroup larfgp * * ===================================================================== SUBROUTINE SLARFGP( N, ALPHA, X, INCX, TAU ) @@ -122,7 +122,7 @@ * .. * .. Local Scalars .. INTEGER J, KNT - REAL BETA, BIGNUM, SAVEALPHA, SMLNUM, XNORM + REAL BETA, BIGNUM, EPS, SAVEALPHA, SMLNUM, XNORM * .. * .. External Functions .. REAL SLAMCH, SLAPY2, SNRM2 @@ -141,9 +141,10 @@ RETURN END IF * + EPS = SLAMCH( 'Precision' ) XNORM = SNRM2( N-1, X, INCX ) * - IF( XNORM.EQ.ZERO ) THEN + IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN * * H = [+/-1, 0; I], sign chosen so ALPHA >= 0. * diff --git a/lapack-netlib/SRC/slarre.f b/lapack-netlib/SRC/slarre.f index 2e34ca5a6..e1b52c385 100644 --- a/lapack-netlib/SRC/slarre.f +++ b/lapack-netlib/SRC/slarre.f @@ -51,7 +51,7 @@ *> SSTEMR to compute the eigenvectors of T. *> The accuracy varies depending on whether bisection is used to *> find a few eigenvalues or the dqds algorithm (subroutine SLASQ2) to -*> conpute all and then discard any unwanted one. +*> compute all and then discard any unwanted one. *> As an added benefit, SLARRE also outputs the n *> Gerschgorin intervals for the matrices L_i D_i L_i^T. *> \endverbatim diff --git a/lapack-netlib/SRC/slaruv.f b/lapack-netlib/SRC/slaruv.f index c25dc2e2f..cd37a1c47 100644 --- a/lapack-netlib/SRC/slaruv.f +++ b/lapack-netlib/SRC/slaruv.f @@ -382,6 +382,11 @@ $ 1537 / * .. * .. Executable Statements .. +* +* Quick return for N < 1 + IF ( N < 1 ) THEN + RETURN + END IF * I1 = ISEED( 1 ) I2 = ISEED( 2 ) diff --git a/lapack-netlib/SRC/slas2.f b/lapack-netlib/SRC/slas2.f index 6ae86204c..02ecbf434 100644 --- a/lapack-netlib/SRC/slas2.f +++ b/lapack-netlib/SRC/slas2.f @@ -93,9 +93,7 @@ *> infinite. *> *> Overflow will not occur unless the largest singular value itself -*> overflows, or is within a few ulps of overflow. (On machines with -*> partial overflow, like the Cray, overflow may occur if the largest -*> singular value is within a factor of 2 of overflow.) +*> overflows, or is within a few ulps of overflow. *> *> Underflow is harmless if underflow is gradual. Otherwise, results *> may correspond to a matrix modified by perturbations of size near diff --git a/lapack-netlib/SRC/slasd0.f b/lapack-netlib/SRC/slasd0.f index a45f741a9..c8a5c7838 100644 --- a/lapack-netlib/SRC/slasd0.f +++ b/lapack-netlib/SRC/slasd0.f @@ -79,10 +79,11 @@ *> On exit, E has been destroyed. *> \endverbatim *> -*> \param[out] U +*> \param[in,out] U *> \verbatim *> U is REAL array, dimension (LDU, N) -*> On exit, U contains the left singular vectors. +*> On exit, U contains the left singular vectors, +*> if U passed in as (N, N) Identity. *> \endverbatim *> *> \param[in] LDU @@ -91,10 +92,11 @@ *> On entry, leading dimension of U. *> \endverbatim *> -*> \param[out] VT +*> \param[in,out] VT *> \verbatim *> VT is REAL array, dimension (LDVT, M) -*> On exit, VT**T contains the right singular vectors. +*> On exit, VT**T contains the right singular vectors, +*> if VT passed in as (M, M) Identity. *> \endverbatim *> *> \param[in] LDVT diff --git a/lapack-netlib/SRC/slasd3.f b/lapack-netlib/SRC/slasd3.f index f9420f88a..8f74743c2 100644 --- a/lapack-netlib/SRC/slasd3.f +++ b/lapack-netlib/SRC/slasd3.f @@ -44,13 +44,6 @@ *> appropriate calls to SLASD4 and then updates the singular *> vectors by matrix multiplication. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. -*> *> SLASD3 is called from SLASD1. *> \endverbatim * @@ -103,7 +96,7 @@ *> The leading dimension of the array Q. LDQ >= K. *> \endverbatim *> -*> \param[in,out] DSIGMA +*> \param[in] DSIGMA *> \verbatim *> DSIGMA is REAL array, dimension(K) *> The first K elements of this array contain the old roots @@ -249,8 +242,8 @@ REAL RHO, TEMP * .. * .. External Functions .. - REAL SLAMC3, SNRM2 - EXTERNAL SLAMC3, SNRM2 + REAL SNRM2 + EXTERNAL SNRM2 * .. * .. External Subroutines .. EXTERNAL SCOPY, SGEMM, SLACPY, SLASCL, SLASD4, XERBLA @@ -310,27 +303,6 @@ RETURN END IF * -* Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I), -* which on any of these machines zeros out the bottommost -* bit of DSIGMA(I) if it is 1; this makes the subsequent -* subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DSIGMA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DSIGMA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DSIGMA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 20 I = 1, K - DSIGMA( I ) = SLAMC3( DSIGMA( I ), DSIGMA( I ) ) - DSIGMA( I ) - 20 CONTINUE -* * Keep a copy of Z. * CALL SCOPY( K, Z, 1, Q, 1 ) diff --git a/lapack-netlib/SRC/slasd8.f b/lapack-netlib/SRC/slasd8.f index 43b171e5f..df5002367 100644 --- a/lapack-netlib/SRC/slasd8.f +++ b/lapack-netlib/SRC/slasd8.f @@ -121,14 +121,12 @@ *> The leading dimension of DIFR, must be at least K. *> \endverbatim *> -*> \param[in,out] DSIGMA +*> \param[in] DSIGMA *> \verbatim *> DSIGMA is REAL array, dimension ( K ) *> On entry, the first K elements of this array contain the old *> roots of the deflated updating problem. These are the poles *> of the secular equation. -*> On exit, the elements of DSIGMA may be very slightly altered -*> in value. *> \endverbatim *> *> \param[out] WORK @@ -227,27 +225,6 @@ RETURN END IF * -* Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I), -* which on any of these machines zeros out the bottommost -* bit of DSIGMA(I) if it is 1; this makes the subsequent -* subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DSIGMA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DSIGMA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DLAMBDA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 10 I = 1, K - DSIGMA( I ) = SLAMC3( DSIGMA( I ), DSIGMA( I ) ) - DSIGMA( I ) - 10 CONTINUE -* * Book keeping. * IWK1 = 1 @@ -312,6 +289,11 @@ DSIGJP = -DSIGMA( J+1 ) END IF WORK( J ) = -Z( J ) / DIFLJ / ( DSIGMA( J )+DJ ) +* +* Use calls to the subroutine SLAMC3 to enforce the parentheses +* (x+y)+z. The goal is to prevent optimizing compilers +* from doing x+(y+z). +* DO 60 I = 1, J - 1 WORK( I ) = Z( I ) / ( SLAMC3( DSIGMA( I ), DSIGJ )-DIFLJ ) $ / ( DSIGMA( I )+DJ ) diff --git a/lapack-netlib/SRC/slassq.f90 b/lapack-netlib/SRC/slassq.f90 index 19f49402b..c8959f4a7 100644 --- a/lapack-netlib/SRC/slassq.f90 +++ b/lapack-netlib/SRC/slassq.f90 @@ -34,28 +34,15 @@ !> !> \verbatim !> -!> SLASSQ returns the values scl and smsq such that +!> SLASSQ returns the values scale_out and sumsq_out such that !> -!> ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq, +!> (scale_out**2)*sumsq_out = x( 1 )**2 +...+ x( n )**2 + (scale**2)*sumsq, !> -!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is +!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is !> assumed to be non-negative. !> !> scale and sumsq must be supplied in SCALE and SUMSQ and -!> scl and smsq are overwritten on SCALE and SUMSQ respectively. -!> -!> If scale * sqrt( sumsq ) > tbig then -!> we require: scale >= sqrt( TINY*EPS ) / sbig on entry, -!> and if 0 < scale * sqrt( sumsq ) < tsml then -!> we require: scale <= sqrt( HUGE ) / ssml on entry, -!> where -!> tbig -- upper threshold for values whose square is representable; -!> sbig -- scaling constant for big numbers; \see la_constants.f90 -!> tsml -- lower threshold for values whose square is representable; -!> ssml -- scaling constant for small numbers; \see la_constants.f90 -!> and -!> TINY*EPS -- tiniest representable number; -!> HUGE -- biggest representable number. +!> scale_out and sumsq_out are overwritten on SCALE and SUMSQ respectively. !> !> \endverbatim ! @@ -72,7 +59,7 @@ !> \verbatim !> X is REAL array, dimension (1+(N-1)*abs(INCX)) !> The vector for which a scaled sum of squares is computed. -!> x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n. +!> x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n. !> \endverbatim !> !> \param[in] INCX @@ -82,24 +69,24 @@ !> If INCX > 0, X(1+(i-1)*INCX) = x(i) for 1 <= i <= n !> If INCX < 0, X(1-(n-i)*INCX) = x(i) for 1 <= i <= n !> If INCX = 0, x isn't a vector so there is no need to call -!> this subroutine. If you call it anyway, it will count x(1) +!> this subroutine. If you call it anyway, it will count x(1) !> in the vector norm N times. !> \endverbatim !> !> \param[in,out] SCALE !> \verbatim !> SCALE is REAL -!> On entry, the value scale in the equation above. -!> On exit, SCALE is overwritten with scl , the scaling factor +!> On entry, the value scale in the equation above. +!> On exit, SCALE is overwritten by scale_out, the scaling factor !> for the sum of squares. !> \endverbatim !> !> \param[in,out] SUMSQ !> \verbatim !> SUMSQ is REAL -!> On entry, the value sumsq in the equation above. -!> On exit, SUMSQ is overwritten with smsq , the basic sum of -!> squares from which scl has been factored out. +!> On entry, the value sumsq in the equation above. +!> On exit, SUMSQ is overwritten by sumsq_out, the basic sum of +!> squares from which scale_out has been factored out. !> \endverbatim ! ! Authors: @@ -130,10 +117,10 @@ !> !> \endverbatim ! -!> \ingroup OTHERauxiliary +!> \ingroup lassq ! ! ===================================================================== -subroutine SLASSQ( n, x, incx, scl, sumsq ) +subroutine SLASSQ( n, x, incx, scale, sumsq ) use LA_CONSTANTS, & only: wp=>sp, zero=>szero, one=>sone, & sbig=>ssbig, ssml=>sssml, tbig=>stbig, tsml=>stsml @@ -145,7 +132,7 @@ subroutine SLASSQ( n, x, incx, scl, sumsq ) ! ! .. Scalar Arguments .. integer :: incx, n - real(wp) :: scl, sumsq + real(wp) :: scale, sumsq ! .. ! .. Array Arguments .. real(wp) :: x(*) @@ -158,10 +145,10 @@ subroutine SLASSQ( n, x, incx, scl, sumsq ) ! ! Quick return if possible ! - if( LA_ISNAN(scl) .or. LA_ISNAN(sumsq) ) return - if( sumsq == zero ) scl = one - if( scl == zero ) then - scl = one + if( LA_ISNAN(scale) .or. LA_ISNAN(sumsq) ) return + if( sumsq == zero ) scale = one + if( scale == zero ) then + scale = one sumsq = zero end if if (n <= 0) then @@ -198,15 +185,27 @@ subroutine SLASSQ( n, x, incx, scl, sumsq ) ! Put the existing sum of squares into one of the accumulators ! if( sumsq > zero ) then - ax = scl*sqrt( sumsq ) + ax = scale*sqrt( sumsq ) if (ax > tbig) then -! We assume scl >= sqrt( TINY*EPS ) / sbig - abig = abig + (scl*sbig)**2 * sumsq + if (scale > one) then + scale = scale * sbig + abig = abig + scale * (scale * sumsq) + else + ! sumsq > tbig^2 => (sbig * (sbig * sumsq)) is representable + abig = abig + scale * (scale * (sbig * (sbig * sumsq))) + end if else if (ax < tsml) then -! We assume scl <= sqrt( HUGE ) / ssml - if (notbig) asml = asml + (scl*ssml)**2 * sumsq + if (notbig) then + if (scale < one) then + scale = scale * ssml + asml = asml + scale * (scale * sumsq) + else + ! sumsq < tsml^2 => (ssml * (ssml * sumsq)) is representable + asml = asml + scale * (scale * (ssml * (ssml * sumsq))) + end if + end if else - amed = amed + scl**2 * sumsq + amed = amed + scale * (scale * sumsq) end if end if ! @@ -220,7 +219,7 @@ subroutine SLASSQ( n, x, incx, scl, sumsq ) if (amed > zero .or. LA_ISNAN(amed)) then abig = abig + (amed*sbig)*sbig end if - scl = one / sbig + scale = one / sbig sumsq = abig else if (asml > zero) then ! @@ -236,17 +235,17 @@ subroutine SLASSQ( n, x, incx, scl, sumsq ) ymin = asml ymax = amed end if - scl = one + scale = one sumsq = ymax**2*( one + (ymin/ymax)**2 ) else - scl = one / ssml + scale = one / ssml sumsq = asml end if else ! ! Otherwise all values are mid-range or zero ! - scl = one + scale = one sumsq = amed end if return diff --git a/lapack-netlib/SRC/slasv2.f b/lapack-netlib/SRC/slasv2.f index 6b98e9a0c..bf5d3ea0e 100644 --- a/lapack-netlib/SRC/slasv2.f +++ b/lapack-netlib/SRC/slasv2.f @@ -124,9 +124,7 @@ *> infinite. *> *> Overflow will not occur unless the largest singular value itself -*> overflows or is within a few ulps of overflow. (On machines with -*> partial overflow, like the Cray, overflow may occur if the largest -*> singular value is within a factor of 2 of overflow.) +*> overflows or is within a few ulps of overflow. *> *> Underflow is harmless if underflow is gradual. Otherwise, results *> may correspond to a matrix modified by perturbations of size near diff --git a/lapack-netlib/SRC/slaswlq.f b/lapack-netlib/SRC/slaswlq.f index bc7cf343d..594c646db 100644 --- a/lapack-netlib/SRC/slaswlq.f +++ b/lapack-netlib/SRC/slaswlq.f @@ -96,21 +96,24 @@ *> The leading dimension of the array T. LDT >= MB. *> \endverbatim *> -*> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) -*> +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim -*> The dimension of the array WORK. LWORK >= MB * M. +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim + *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -158,33 +161,39 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup laswlq +*> * ===================================================================== SUBROUTINE SLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), T( LDT, *) + REAL A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL SGELQT, SGEQRT, STPLQT, STPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -195,12 +204,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LE.0 ) THEN INFO = -4 @@ -208,60 +224,60 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = MB*M + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN - CALL SGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) + IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN + CALL SGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO ) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL SGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL SGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M, (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL STPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), - $ LDA, T(1, CTR * M + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL STPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + $ LDA, T(1, CTR * M + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL STPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), - $ LDA, T(1, CTR * M + 1), LDT, - $ WORK, INFO ) - END IF + $ LDA, T(1, CTR * M + 1), LDT, + $ WORK, INFO ) + END IF * - WORK( 1 ) = M * MB + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLASWLQ diff --git a/lapack-netlib/SRC/slatrs.f b/lapack-netlib/SRC/slatrs.f index 0761d656f..9765ea3d7 100644 --- a/lapack-netlib/SRC/slatrs.f +++ b/lapack-netlib/SRC/slatrs.f @@ -261,6 +261,9 @@ REAL BIGNUM, GROW, REC, SMLNUM, SUMJ, TJJ, TJJS, $ TMAX, TSCAL, USCAL, XBND, XJ, XMAX * .. +* .. Local Arrays .. + REAL WORK (1) +* .. * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX @@ -362,7 +365,7 @@ * A is upper triangular. * DO J = 2, N - TMAX = MAX( SLANGE( 'M', J-1, 1, A( 1, J ), 1, SUMJ ), + TMAX = MAX( SLANGE( 'M', J-1, 1, A( 1, J ), 1, WORK ), $ TMAX ) END DO ELSE @@ -371,7 +374,7 @@ * DO J = 1, N - 1 TMAX = MAX( SLANGE( 'M', N-J, 1, A( J+1, J ), 1, - $ SUMJ ), TMAX ) + $ WORK ), TMAX ) END DO END IF * diff --git a/lapack-netlib/SRC/slatrs3.f b/lapack-netlib/SRC/slatrs3.f index c3a08e524..17052289e 100644 --- a/lapack-netlib/SRC/slatrs3.f +++ b/lapack-netlib/SRC/slatrs3.f @@ -151,13 +151,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK). -*> On exit, if INFO = 0, WORK(1) returns the optimal size of -*> WORK. +*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -165,6 +168,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -181,7 +185,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -253,7 +257,7 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. @@ -264,7 +268,8 @@ EXTERNAL ILAENV, LSAME, SLAMCH, SLANGE, SLARMM * .. * .. External Subroutines .. - EXTERNAL SLATRS, SSCAL, XERBLA + REAL SROUNDUP_LWORK + EXTERNAL SLATRS, SSCAL, SROUNDUP_LWORK, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN @@ -292,15 +297,24 @@ * row. WORK( I + KK * LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). +* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * * Test the input parameters. * @@ -322,7 +336,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN @@ -574,7 +588,7 @@ * Prepare the linear update to be executed with GEMM. * For each column, compute a consistent scaling, a * scaling factor to survive the linear update, and -* rescale the column segments, if necesssary. Then +* rescale the column segments, if necessary. Then * the linear update is safely executed. * DO KK = 1, K2-K1 @@ -650,6 +664,8 @@ END DO END DO RETURN +* + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * * End of SLATRS3 * diff --git a/lapack-netlib/SRC/slatsqr.f b/lapack-netlib/SRC/slatsqr.f index 33966c01f..4730815b5 100644 --- a/lapack-netlib/SRC/slatsqr.f +++ b/lapack-netlib/SRC/slatsqr.f @@ -101,14 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim -*> The dimension of the array WORK. LWORK >= NB*N. +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -160,33 +164,39 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup latsqr +*> * ===================================================================== SUBROUTINE SLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), T(LDT, *) + REAL A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL SGEQRT, STPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -197,6 +207,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -204,64 +221,65 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = NB*N +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL SGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + IF( (MB.LE.N) .OR. (MB.GE.M) ) THEN + CALL SGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL SGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CALL SGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) * - CTR = 1 - DO I = MB+1, II-MB+N , (MB-N) + CTR = 1 + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL STPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, - $ T(1, CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL STPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + $ T(1, CTR * N + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL STPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, - $ T(1, CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + IF( II.LE.M ) THEN + CALL STPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + $ T(1, CTR * N + 1), LDT, + $ WORK, INFO ) + END IF * - work( 1 ) = N*NB + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLATSQR diff --git a/lapack-netlib/SRC/sorbdb1.f b/lapack-netlib/SRC/sorbdb1.f index c860f4366..191e5742a 100644 --- a/lapack-netlib/SRC/sorbdb1.f +++ b/lapack-netlib/SRC/sorbdb1.f @@ -37,7 +37,7 @@ *>\verbatim *> *> SORBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/sorbdb2.f b/lapack-netlib/SRC/sorbdb2.f index 484d352f8..b2ff34bb1 100644 --- a/lapack-netlib/SRC/sorbdb2.f +++ b/lapack-netlib/SRC/sorbdb2.f @@ -37,7 +37,7 @@ *>\verbatim *> *> SORBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/sorbdb3.f b/lapack-netlib/SRC/sorbdb3.f index 6209b24ee..99478c5d0 100644 --- a/lapack-netlib/SRC/sorbdb3.f +++ b/lapack-netlib/SRC/sorbdb3.f @@ -37,7 +37,7 @@ *>\verbatim *> *> SORBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/sorbdb4.f b/lapack-netlib/SRC/sorbdb4.f index bf60fb7bb..0fef5b759 100644 --- a/lapack-netlib/SRC/sorbdb4.f +++ b/lapack-netlib/SRC/sorbdb4.f @@ -38,7 +38,7 @@ *>\verbatim *> *> SORBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/sorbdb5.f b/lapack-netlib/SRC/sorbdb5.f index 8c67aedfb..8fb88876f 100644 --- a/lapack-netlib/SRC/sorbdb5.f +++ b/lapack-netlib/SRC/sorbdb5.f @@ -148,7 +148,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unbdb5 * * ===================================================================== SUBROUTINE SORBDB5( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, @@ -169,18 +169,21 @@ * ===================================================================== * * .. Parameters .. + REAL REALZERO + PARAMETER ( REALZERO = 0.0E0 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E0, ZERO = 0.0E0 ) * .. * .. Local Scalars .. INTEGER CHILDINFO, I, J + REAL EPS, NORM, SCL, SSQ * .. * .. External Subroutines .. - EXTERNAL SORBDB6, XERBLA + EXTERNAL SLASSQ, SORBDB6, SSCAL, XERBLA * .. * .. External Functions .. - REAL SNRM2 - EXTERNAL SNRM2 + REAL SLAMCH, SNRM2 + EXTERNAL SLAMCH, SNRM2 * .. * .. Intrinsic Function .. INTRINSIC MAX @@ -213,16 +216,33 @@ RETURN END IF * -* Project X onto the orthogonal complement of Q + EPS = SLAMCH( 'Precision' ) * - CALL SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, LDQ2, - $ WORK, LWORK, CHILDINFO ) +* Project X onto the orthogonal complement of Q if X is nonzero * -* If the projection is nonzero, then return + SCL = REALZERO + SSQ = REALZERO + CALL SLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL SLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM = SCL * SQRT( SSQ ) * - IF( SNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. SNRM2(M2,X2,INCX2) .NE. ZERO ) THEN - RETURN + IF( NORM .GT. N * EPS ) THEN +* Scale vector to unit norm to avoid problems in the caller code. +* Computing the reciprocal is undesirable but +* * xLASCL cannot be used because of the vector increments and +* * the round-off error has a negligible impact on +* orthogonalization. + CALL SSCAL( M1, ONE / NORM, X1, INCX1 ) + CALL SSCAL( M2, ONE / NORM, X2, INCX2 ) + CALL SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, + $ LDQ2, WORK, LWORK, CHILDINFO ) +* +* If the projection is nonzero, then return +* + IF( SNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. SNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN + RETURN + END IF END IF * * Project each standard basis vector e_1,...,e_M1 in turn, stopping @@ -238,8 +258,8 @@ END DO CALL SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, $ LDQ2, WORK, LWORK, CHILDINFO ) - IF( SNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. SNRM2(M2,X2,INCX2) .NE. ZERO ) THEN + IF( SNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. SNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN RETURN END IF END DO @@ -257,8 +277,8 @@ X2(I) = ONE CALL SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, $ LDQ2, WORK, LWORK, CHILDINFO ) - IF( SNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. SNRM2(M2,X2,INCX2) .NE. ZERO ) THEN + IF( SNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. SNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN RETURN END IF END DO diff --git a/lapack-netlib/SRC/sorbdb6.f b/lapack-netlib/SRC/sorbdb6.f index b2449e3be..eac177722 100644 --- a/lapack-netlib/SRC/sorbdb6.f +++ b/lapack-netlib/SRC/sorbdb6.f @@ -41,9 +41,8 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The Euclidean norm of X must be one and the columns of Q must be -*> orthonormal. The orthogonalized vector will be zero if and only if it -*> lies entirely in the range of Q. +*> The columns of Q must be orthonormal. The orthogonalized vector will +*> be zero if and only if it lies entirely in the range of Q. *> *> The projection is computed with at most two iterations of the *> classical Gram-Schmidt algorithm, see @@ -152,7 +151,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unbdb6 * * ===================================================================== SUBROUTINE SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, @@ -174,7 +173,7 @@ * * .. Parameters .. REAL ALPHA, REALONE, REALZERO - PARAMETER ( ALPHA = 0.01E0, REALONE = 1.0E0, + PARAMETER ( ALPHA = 0.83E0, REALONE = 1.0E0, $ REALZERO = 0.0E0 ) REAL NEGONE, ONE, ZERO PARAMETER ( NEGONE = -1.0E0, ONE = 1.0E0, ZERO = 0.0E0 ) @@ -222,14 +221,16 @@ * EPS = SLAMCH( 'Precision' ) * +* Compute the Euclidean norm of X +* + SCL = REALZERO + SSQ = REALZERO + CALL SLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL SLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM = SCL * SQRT( SSQ ) +* * First, project X onto the orthogonal complement of Q's column * space -* -* Christoph Conrads: In debugging mode the norm should be computed -* and an assertion added comparing the norm with one. Alas, Fortran -* never made it into 1989 when assert() was introduced into the C -* programming language. - NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N diff --git a/lapack-netlib/SRC/sorgbr.f b/lapack-netlib/SRC/sorgbr.f index b1a5c03a2..46f4ab130 100644 --- a/lapack-netlib/SRC/sorgbr.f +++ b/lapack-netlib/SRC/sorgbr.f @@ -150,7 +150,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGBcomputational +*> \ingroup ungbr * * ===================================================================== SUBROUTINE SORGBR( VECT, M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -179,7 +179,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SORGLQ, SORGQR, XERBLA @@ -240,7 +241,7 @@ CALL XERBLA( 'SORGBR', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * @@ -326,7 +327,7 @@ END IF END IF END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SORGBR diff --git a/lapack-netlib/SRC/sorghr.f b/lapack-netlib/SRC/sorghr.f index f65cd898c..624ede282 100644 --- a/lapack-netlib/SRC/sorghr.f +++ b/lapack-netlib/SRC/sorghr.f @@ -119,7 +119,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unghr * * ===================================================================== SUBROUTINE SORGHR( N, ILO, IHI, A, LDA, TAU, WORK, LWORK, INFO ) @@ -150,7 +150,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -177,7 +178,7 @@ IF( INFO.EQ.0 ) THEN NB = ILAENV( 1, 'SORGQR', ' ', NH, NH, NH, -1 ) LWKOPT = MAX( 1, NH )*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -229,7 +230,7 @@ CALL SORGQR( NH, NH, NH, A( ILO+1, ILO+1 ), LDA, TAU( ILO ), $ WORK, LWORK, IINFO ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SORGHR diff --git a/lapack-netlib/SRC/sorglq.f b/lapack-netlib/SRC/sorglq.f index b1d107964..30f6d5d48 100644 --- a/lapack-netlib/SRC/sorglq.f +++ b/lapack-netlib/SRC/sorglq.f @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unglq * * ===================================================================== SUBROUTINE SORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -155,7 +155,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -164,7 +165,7 @@ INFO = 0 NB = ILAENV( 1, 'SORGLQ', ' ', M, N, K, -1 ) LWKOPT = MAX( 1, M )*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -278,7 +279,7 @@ 50 CONTINUE END IF * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of SORGLQ diff --git a/lapack-netlib/SRC/sorgql.f b/lapack-netlib/SRC/sorgql.f index 34ab5edef..f104e64b2 100644 --- a/lapack-netlib/SRC/sorgql.f +++ b/lapack-netlib/SRC/sorgql.f @@ -121,7 +121,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup ungql * * ===================================================================== SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -156,7 +156,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -181,7 +182,7 @@ NB = ILAENV( 1, 'SORGQL', ' ', M, N, K, -1 ) LWKOPT = N*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN INFO = -8 @@ -285,7 +286,7 @@ 50 CONTINUE END IF * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of SORGQL diff --git a/lapack-netlib/SRC/sorgqr.f b/lapack-netlib/SRC/sorgqr.f index 056de54d7..a87ea6c65 100644 --- a/lapack-netlib/SRC/sorgqr.f +++ b/lapack-netlib/SRC/sorgqr.f @@ -121,7 +121,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup ungqr * * ===================================================================== SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -156,7 +156,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -165,7 +166,7 @@ INFO = 0 NB = ILAENV( 1, 'SORGQR', ' ', M, N, K, -1 ) LWKOPT = MAX( 1, N )*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -279,7 +280,7 @@ 50 CONTINUE END IF * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of SORGQR diff --git a/lapack-netlib/SRC/sorgrq.f b/lapack-netlib/SRC/sorgrq.f index d9b6ccbe6..331f20904 100644 --- a/lapack-netlib/SRC/sorgrq.f +++ b/lapack-netlib/SRC/sorgrq.f @@ -121,7 +121,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup ungrq * * ===================================================================== SUBROUTINE SORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -156,7 +156,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -181,7 +182,7 @@ NB = ILAENV( 1, 'SORGRQ', ' ', M, N, K, -1 ) LWKOPT = M*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN INFO = -8 @@ -285,7 +286,7 @@ 50 CONTINUE END IF * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of SORGRQ diff --git a/lapack-netlib/SRC/sorgtr.f b/lapack-netlib/SRC/sorgtr.f index 67bde00cb..6a1dc3034 100644 --- a/lapack-netlib/SRC/sorgtr.f +++ b/lapack-netlib/SRC/sorgtr.f @@ -116,7 +116,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup ungtr * * ===================================================================== SUBROUTINE SORGTR( UPLO, N, A, LDA, TAU, WORK, LWORK, INFO ) @@ -146,7 +146,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SORGQL, SORGQR, XERBLA @@ -178,7 +179,7 @@ NB = ILAENV( 1, 'SORGQR', ' ', N-1, N-1, N-1, -1 ) END IF LWKOPT = MAX( 1, N-1 )*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -244,7 +245,7 @@ $ LWORK, IINFO ) END IF END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SORGTR diff --git a/lapack-netlib/SRC/sorgtsqr.f b/lapack-netlib/SRC/sorgtsqr.f index a755fa4a4..0be27af77 100644 --- a/lapack-netlib/SRC/sorgtsqr.f +++ b/lapack-netlib/SRC/sorgtsqr.f @@ -133,6 +133,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= (M+NB)*N. *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK @@ -156,7 +157,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup singleOTHERcomputational +*> \ingroup ungtsqr * *> \par Contributors: * ================== @@ -195,11 +196,15 @@ LOGICAL LQUERY INTEGER IINFO, LDC, LWORKOPT, LC, LW, NBLOCAL, J * .. +* .. External Functions .. + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL SCOPY, SLAMTSQR, SLASET, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC REAL, MAX, MIN + INTRINSIC MAX, MIN * .. * .. Executable Statements .. * @@ -256,14 +261,14 @@ CALL XERBLA( 'SORGTSQR', -INFO ) RETURN ELSE IF ( LQUERY ) THEN - WORK( 1 ) = REAL( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * * Quick return if possible * IF( MIN( M, N ).EQ.0 ) THEN - WORK( 1 ) = REAL( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * @@ -296,9 +301,9 @@ CALL SCOPY( M, WORK( (J-1)*LDC + 1 ), 1, A( 1, J ), 1 ) END DO * - WORK( 1 ) = REAL( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN * * End of SORGTSQR * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/sorgtsqr_row.f b/lapack-netlib/SRC/sorgtsqr_row.f index d2a2150cd..5a1e1ff07 100644 --- a/lapack-netlib/SRC/sorgtsqr_row.f +++ b/lapack-netlib/SRC/sorgtsqr_row.f @@ -169,7 +169,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup sigleOTHERcomputational +*> \ingroup ungtsqr_row * *> \par Contributors: * ================== @@ -213,11 +213,15 @@ * .. Local Arrays .. REAL DUMMY( 1, 1 ) * .. +* .. External Functions .. + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL SLARFB_GETT, SLASET, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC REAL, MAX, MIN + INTRINSIC MAX, MIN * .. * .. Executable Statements .. * @@ -255,14 +259,14 @@ CALL XERBLA( 'SORGTSQR_ROW', -INFO ) RETURN ELSE IF ( LQUERY ) THEN - WORK( 1 ) = REAL( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * * Quick return if possible * IF( MIN( M, N ).EQ.0 ) THEN - WORK( 1 ) = REAL( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * @@ -371,7 +375,7 @@ * END DO * - WORK( 1 ) = REAL( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN * * End of SORGTSQR_ROW diff --git a/lapack-netlib/SRC/sorm22.f b/lapack-netlib/SRC/sorm22.f index 15096870a..886adb2cf 100644 --- a/lapack-netlib/SRC/sorm22.f +++ b/lapack-netlib/SRC/sorm22.f @@ -155,7 +155,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unm22 * * ===================================================================== SUBROUTINE SORM22( SIDE, TRANS, M, N, N1, N2, Q, LDQ, C, LDC, @@ -187,13 +187,14 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMM, SLACPY, STRMM, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC REAL, MAX, MIN + INTRINSIC MAX, MIN * .. * .. Executable Statements .. * @@ -237,7 +238,7 @@ * IF( INFO.EQ.0 ) THEN LWKOPT = M*N - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -430,7 +431,7 @@ END IF END IF * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SORM22 diff --git a/lapack-netlib/SRC/sormbr.f b/lapack-netlib/SRC/sormbr.f index efe5be41a..e2dccc363 100644 --- a/lapack-netlib/SRC/sormbr.f +++ b/lapack-netlib/SRC/sormbr.f @@ -188,7 +188,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unmbr * * ===================================================================== SUBROUTINE SORMBR( VECT, SIDE, TRANS, M, N, K, A, LDA, TAU, C, @@ -217,7 +217,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SORMLQ, SORMQR, XERBLA @@ -285,7 +286,7 @@ END IF END IF LWKOPT = NW*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -363,7 +364,7 @@ $ TAU, C( I1, I2 ), LDC, WORK, LWORK, IINFO ) END IF END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SORMBR diff --git a/lapack-netlib/SRC/sormhr.f b/lapack-netlib/SRC/sormhr.f index 2d2053af4..e033feb38 100644 --- a/lapack-netlib/SRC/sormhr.f +++ b/lapack-netlib/SRC/sormhr.f @@ -171,7 +171,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unmhr * * ===================================================================== SUBROUTINE SORMHR( SIDE, TRANS, M, N, ILO, IHI, A, LDA, TAU, C, @@ -199,7 +199,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SORMQR, XERBLA @@ -253,7 +254,7 @@ NB = ILAENV( 1, 'SORMQR', SIDE // TRANS, M, NH, NH, -1 ) END IF LWKOPT = NW*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -285,7 +286,7 @@ CALL SORMQR( SIDE, TRANS, MI, NI, NH, A( ILO+1, ILO ), LDA, $ TAU( ILO ), C( I1, I2 ), LDC, WORK, LWORK, IINFO ) * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SORMHR diff --git a/lapack-netlib/SRC/sormlq.f b/lapack-netlib/SRC/sormlq.f index ee996e560..1a32568b6 100644 --- a/lapack-netlib/SRC/sormlq.f +++ b/lapack-netlib/SRC/sormlq.f @@ -160,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unmlq * * ===================================================================== SUBROUTINE SORMLQ( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC, @@ -195,7 +195,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLARFB, SLARFT, SORML2, XERBLA @@ -246,7 +247,7 @@ NB = MIN( NBMAX, ILAENV( 1, 'SORMLQ', SIDE // TRANS, M, N, K, $ -1 ) ) LWKOPT = NW*NB + TSIZE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -338,7 +339,7 @@ $ C( IC, JC ), LDC, WORK, LDWORK ) 10 CONTINUE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SORMLQ diff --git a/lapack-netlib/SRC/sormql.f b/lapack-netlib/SRC/sormql.f index 72a8d22ee..9564d4141 100644 --- a/lapack-netlib/SRC/sormql.f +++ b/lapack-netlib/SRC/sormql.f @@ -160,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unmql * * ===================================================================== SUBROUTINE SORMQL( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC, @@ -194,7 +194,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLARFB, SLARFT, SORM2L, XERBLA @@ -249,7 +250,7 @@ $ K, -1 ) ) LWKOPT = NW*NB + TSIZE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -330,7 +331,7 @@ $ WORK, LDWORK ) 10 CONTINUE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SORMQL diff --git a/lapack-netlib/SRC/sormqr.f b/lapack-netlib/SRC/sormqr.f index 5d4256f09..adb1203df 100644 --- a/lapack-netlib/SRC/sormqr.f +++ b/lapack-netlib/SRC/sormqr.f @@ -160,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unmqr * * ===================================================================== SUBROUTINE SORMQR( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC, @@ -194,7 +194,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLARFB, SLARFT, SORM2R, XERBLA @@ -245,7 +246,7 @@ NB = MIN( NBMAX, ILAENV( 1, 'SORMQR', SIDE // TRANS, M, N, K, $ -1 ) ) LWKOPT = NW*NB + TSIZE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -331,7 +332,7 @@ $ C( IC, JC ), LDC, WORK, LDWORK ) 10 CONTINUE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SORMQR diff --git a/lapack-netlib/SRC/sormrq.f b/lapack-netlib/SRC/sormrq.f index 62fcdacdb..f091f0507 100644 --- a/lapack-netlib/SRC/sormrq.f +++ b/lapack-netlib/SRC/sormrq.f @@ -160,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unmrq * * ===================================================================== SUBROUTINE SORMRQ( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC, @@ -195,7 +195,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLARFB, SLARFT, SORMR2, XERBLA @@ -250,7 +251,7 @@ $ K, -1 ) ) LWKOPT = NW*NB + TSIZE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -337,7 +338,7 @@ $ WORK, LDWORK ) 10 CONTINUE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SORMRQ diff --git a/lapack-netlib/SRC/sormrz.f b/lapack-netlib/SRC/sormrz.f index cdadd62b5..b037a984b 100644 --- a/lapack-netlib/SRC/sormrz.f +++ b/lapack-netlib/SRC/sormrz.f @@ -168,7 +168,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unmrz * *> \par Contributors: * ================== @@ -213,7 +213,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLARZB, SLARZT, SORMR3, XERBLA @@ -271,7 +272,7 @@ $ K, -1 ) ) LWKOPT = NW*NB + TSIZE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -367,7 +368,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/sormtr.f b/lapack-netlib/SRC/sormtr.f index 3ba749fee..1bc87768f 100644 --- a/lapack-netlib/SRC/sormtr.f +++ b/lapack-netlib/SRC/sormtr.f @@ -164,7 +164,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unmtr * * ===================================================================== SUBROUTINE SORMTR( SIDE, UPLO, TRANS, M, N, A, LDA, TAU, C, LDC, @@ -192,7 +192,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SORMQL, SORMQR, XERBLA @@ -256,7 +257,7 @@ END IF END IF LWKOPT = NW*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -301,7 +302,7 @@ CALL SORMQR( SIDE, TRANS, MI, NI, NQ-1, A( 2, 1 ), LDA, TAU, $ C( I1, I2 ), LDC, WORK, LWORK, IINFO ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SORMTR diff --git a/lapack-netlib/SRC/spbsv.f b/lapack-netlib/SRC/spbsv.f index 2d084424e..8929321ea 100644 --- a/lapack-netlib/SRC/spbsv.f +++ b/lapack-netlib/SRC/spbsv.f @@ -119,9 +119,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/spbsvx.f b/lapack-netlib/SRC/spbsvx.f index 27907eb85..6abb6caae 100644 --- a/lapack-netlib/SRC/spbsvx.f +++ b/lapack-netlib/SRC/spbsvx.f @@ -71,7 +71,7 @@ *> where U is an upper triangular band matrix, and L is a lower *> triangular band matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -281,10 +281,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/spbtf2.f b/lapack-netlib/SRC/spbtf2.f index ff08bc327..b5aac1ccd 100644 --- a/lapack-netlib/SRC/spbtf2.f +++ b/lapack-netlib/SRC/spbtf2.f @@ -97,8 +97,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/spbtrf.f b/lapack-netlib/SRC/spbtrf.f index ef5dcbb98..d3ae216e5 100644 --- a/lapack-netlib/SRC/spbtrf.f +++ b/lapack-netlib/SRC/spbtrf.f @@ -92,8 +92,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/spftrf.f b/lapack-netlib/SRC/spftrf.f index 9e2c11eea..0ed0e3abd 100644 --- a/lapack-netlib/SRC/spftrf.f +++ b/lapack-netlib/SRC/spftrf.f @@ -91,8 +91,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/sposv.f b/lapack-netlib/SRC/sposv.f index fe0a35a56..336332d2d 100644 --- a/lapack-netlib/SRC/sposv.f +++ b/lapack-netlib/SRC/sposv.f @@ -110,9 +110,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/sposvx.f b/lapack-netlib/SRC/sposvx.f index bcf38c7ea..0770897d2 100644 --- a/lapack-netlib/SRC/sposvx.f +++ b/lapack-netlib/SRC/sposvx.f @@ -71,7 +71,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -277,10 +277,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/sposvxx.f b/lapack-netlib/SRC/sposvxx.f index 08adf1d58..19e599f64 100644 --- a/lapack-netlib/SRC/sposvxx.f +++ b/lapack-netlib/SRC/sposvxx.f @@ -88,7 +88,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A (see argument RCOND). If the reciprocal of the condition number diff --git a/lapack-netlib/SRC/spotf2.f b/lapack-netlib/SRC/spotf2.f index 5b3504834..773b768b2 100644 --- a/lapack-netlib/SRC/spotf2.f +++ b/lapack-netlib/SRC/spotf2.f @@ -89,8 +89,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/spotrf.f b/lapack-netlib/SRC/spotrf.f index 5d5771c86..12ef58a40 100644 --- a/lapack-netlib/SRC/spotrf.f +++ b/lapack-netlib/SRC/spotrf.f @@ -87,8 +87,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/spotrf2.f b/lapack-netlib/SRC/spotrf2.f index ae0484ce1..ef731ffa2 100644 --- a/lapack-netlib/SRC/spotrf2.f +++ b/lapack-netlib/SRC/spotrf2.f @@ -86,8 +86,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/sppsv.f b/lapack-netlib/SRC/sppsv.f index 2d3fb3d91..1f48dd4db 100644 --- a/lapack-netlib/SRC/sppsv.f +++ b/lapack-netlib/SRC/sppsv.f @@ -104,9 +104,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/sppsvx.f b/lapack-netlib/SRC/sppsvx.f index 7d71efcd5..bd2da20ee 100644 --- a/lapack-netlib/SRC/sppsvx.f +++ b/lapack-netlib/SRC/sppsvx.f @@ -69,7 +69,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -262,10 +262,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/spptrf.f b/lapack-netlib/SRC/spptrf.f index 0f90399cc..be51c4824 100644 --- a/lapack-netlib/SRC/spptrf.f +++ b/lapack-netlib/SRC/spptrf.f @@ -79,8 +79,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/spteqr.f b/lapack-netlib/SRC/spteqr.f index 0d6bf911b..6e6e9aa45 100644 --- a/lapack-netlib/SRC/spteqr.f +++ b/lapack-netlib/SRC/spteqr.f @@ -123,8 +123,8 @@ *> < 0: if INFO = -i, the i-th argument had an illegal value. *> > 0: if INFO = i, and i is: *> <= N the Cholesky factorization of the matrix could -*> not be performed because the i-th principal minor -*> was not positive definite. +*> not be performed because the leading principal +*> minor of order i was not positive. *> > N the SVD algorithm failed to converge; *> if INFO = N+i, i off-diagonal elements of the *> bidiagonal factor did not converge to zero. diff --git a/lapack-netlib/SRC/sptsv.f b/lapack-netlib/SRC/sptsv.f index 46aadf4c3..f11c22594 100644 --- a/lapack-netlib/SRC/sptsv.f +++ b/lapack-netlib/SRC/sptsv.f @@ -93,8 +93,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the solution has not been +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the solution has not been *> computed. The factorization has not been completed *> unless i = N. *> \endverbatim diff --git a/lapack-netlib/SRC/sptsvx.f b/lapack-netlib/SRC/sptsvx.f index 8dd04b054..eaa691cef 100644 --- a/lapack-netlib/SRC/sptsvx.f +++ b/lapack-netlib/SRC/sptsvx.f @@ -59,7 +59,7 @@ *> factorization can also be regarded as having the form *> A = U**T*D*U. *> -*> 2. If the leading i-by-i principal minor is not positive definite, +*> 2. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -199,10 +199,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/spttrf.f b/lapack-netlib/SRC/spttrf.f index 2217a7338..e083b7456 100644 --- a/lapack-netlib/SRC/spttrf.f +++ b/lapack-netlib/SRC/spttrf.f @@ -70,8 +70,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite; if k < N, the factorization could not +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive; if k < N, the factorization could not *> be completed, while if k = N, the factorization was *> completed, but D(N) <= 0. *> \endverbatim diff --git a/lapack-netlib/SRC/ssbev_2stage.f b/lapack-netlib/SRC/ssbev_2stage.f index 5752c1ecc..71ace4e27 100644 --- a/lapack-netlib/SRC/ssbev_2stage.f +++ b/lapack-netlib/SRC/ssbev_2stage.f @@ -131,7 +131,7 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, dimension) where *> dimension = (2KD+1)*N + KD*NTHREADS + N @@ -164,7 +164,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHEReigen +*> \ingroup hbev_2stage * *> \par Further Details: * ===================== @@ -182,7 +182,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -190,11 +190,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -232,12 +232,13 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - REAL SLAMCH, SLANSB - EXTERNAL LSAME, SLAMCH, SLANSB, ILAENV2STAGE + REAL SLAMCH, SLANSB, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANSB, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLASCL, SSCAL, SSTEQR, SSTERF, XERBLA, - $ SSYTRD_SB2ST + $ SSYTRD_SB2ST * .. * .. Intrinsic Functions .. INTRINSIC SQRT @@ -268,7 +269,7 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWMIN = 1 - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ELSE IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', JOBZ, $ N, KD, -1, -1 ) @@ -277,7 +278,7 @@ LWTRD = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', JOBZ, $ N, KD, IB, -1 ) LWMIN = N + LHTRD + LWTRD - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ENDIF * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) @@ -343,7 +344,7 @@ LLWORK = LWORK - INDWRK + 1 * CALL SSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W, - $ WORK( INDE ), WORK( INDHOUS ), LHTRD, + $ WORK( INDE ), WORK( INDHOUS ), LHTRD, $ WORK( INDWRK ), LLWORK, IINFO ) * * For eigenvalues only, call SSTERF. For eigenvectors, call SSTEQR. @@ -368,7 +369,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/ssbevd.f b/lapack-netlib/SRC/ssbevd.f index bcf14ce85..e4118dbed 100644 --- a/lapack-netlib/SRC/ssbevd.f +++ b/lapack-netlib/SRC/ssbevd.f @@ -40,12 +40,6 @@ *> a real symmetric band matrix A. If eigenvectors are desired, it uses *> a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -185,7 +179,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHEReigen +*> \ingroup hbevd * * ===================================================================== SUBROUTINE SSBEVD( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, WORK, @@ -219,8 +213,8 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, SLANSB - EXTERNAL LSAME, SLAMCH, SLANSB + REAL SLAMCH, SLANSB, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANSB, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMM, SLACPY, SLASCL, SSBTRD, SSCAL, SSTEDC, @@ -265,7 +259,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -348,7 +342,7 @@ IF( ISCALE.EQ.1 ) $ CALL SSCAL( N, ONE / SIGMA, W, 1 ) * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN RETURN * diff --git a/lapack-netlib/SRC/ssbevd_2stage.f b/lapack-netlib/SRC/ssbevd_2stage.f index 9687ee024..de3f1c010 100644 --- a/lapack-netlib/SRC/ssbevd_2stage.f +++ b/lapack-netlib/SRC/ssbevd_2stage.f @@ -45,12 +45,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it uses *> a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -140,7 +134,7 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, dimension) where *> dimension = (2KD+1)*N + KD*NTHREADS + N @@ -194,7 +188,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHEReigen +*> \ingroup hbevd_2stage * *> \par Further Details: * ===================== @@ -212,7 +206,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -220,11 +214,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -264,8 +258,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - REAL SLAMCH, SLANSB - EXTERNAL LSAME, SLAMCH, SLANSB, ILAENV2STAGE + REAL SLAMCH, SLANSB, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANSB, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMM, SLACPY, SLASCL, SSCAL, SSTEDC, @@ -313,7 +308,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -380,7 +375,7 @@ LLWRK2 = LWORK - INDWK2 + 1 * CALL SSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W, - $ WORK( INDE ), WORK( INDHOUS ), LHTRD, + $ WORK( INDE ), WORK( INDHOUS ), LHTRD, $ WORK( INDWRK ), LLWORK, IINFO ) * * For eigenvalues only, call SSTERF. For eigenvectors, call SSTEDC. @@ -400,7 +395,7 @@ IF( ISCALE.EQ.1 ) $ CALL SSCAL( N, ONE / SIGMA, W, 1 ) * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN RETURN * diff --git a/lapack-netlib/SRC/ssbevx_2stage.f b/lapack-netlib/SRC/ssbevx_2stage.f index 224b676b7..d25d3639a 100644 --- a/lapack-netlib/SRC/ssbevx_2stage.f +++ b/lapack-netlib/SRC/ssbevx_2stage.f @@ -235,7 +235,7 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 7*N, dimension) where *> dimension = (2KD+1)*N + KD*NTHREADS + 2*N @@ -281,7 +281,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHEReigen +*> \ingroup hbevx_2stage * *> \par Further Details: * ===================== @@ -299,7 +299,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -307,11 +307,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -357,8 +357,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - REAL SLAMCH, SLANSB - EXTERNAL LSAME, SLAMCH, SLANSB, ILAENV2STAGE + REAL SLAMCH, SLANSB, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANSB, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SGEMV, SLACPY, SLASCL, SSCAL, @@ -414,7 +415,7 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWMIN = 1 - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ELSE IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', JOBZ, $ N, KD, -1, -1 ) @@ -423,7 +424,7 @@ LWTRD = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', JOBZ, $ N, KD, IB, -1 ) LWMIN = 2*N + LHTRD + LWTRD - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ENDIF * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) @@ -513,7 +514,7 @@ LLWORK = LWORK - INDWRK + 1 * CALL SSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, WORK( INDD ), - $ WORK( INDE ), WORK( INDHOUS ), LHTRD, + $ WORK( INDE ), WORK( INDHOUS ), LHTRD, $ WORK( INDWRK ), LLWORK, IINFO ) * * If all eigenvalues are desired and ABSTOL is less than or equal @@ -624,7 +625,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/ssbgvd.f b/lapack-netlib/SRC/ssbgvd.f index 6dd1fe952..f872e5464 100644 --- a/lapack-netlib/SRC/ssbgvd.f +++ b/lapack-netlib/SRC/ssbgvd.f @@ -43,12 +43,6 @@ *> banded, and B is also positive definite. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -214,7 +208,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHEReigen +*> \ingroup hbgvd * *> \par Contributors: * ================== @@ -253,7 +247,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMM, SLACPY, SPBSTF, SSBGST, SSBTRD, SSTEDC, @@ -298,7 +293,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -359,7 +354,7 @@ CALL SLACPY( 'A', N, N, WORK( INDWK2 ), N, Z, LDZ ) END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/ssbgvx.f b/lapack-netlib/SRC/ssbgvx.f index 271f35964..3607fae82 100644 --- a/lapack-netlib/SRC/ssbgvx.f +++ b/lapack-netlib/SRC/ssbgvx.f @@ -267,7 +267,7 @@ *> Their indices are stored in IFAIL. *> > N: SPBSTF returned an error code; i.e., *> if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/sspevd.f b/lapack-netlib/SRC/sspevd.f index 56329da34..1aae48d1d 100644 --- a/lapack-netlib/SRC/sspevd.f +++ b/lapack-netlib/SRC/sspevd.f @@ -40,12 +40,6 @@ *> of a real symmetric matrix A in packed storage. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -170,7 +164,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHEReigen +*> \ingroup hpevd * * ===================================================================== SUBROUTINE SSPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK, @@ -204,8 +198,8 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, SLANSP - EXTERNAL LSAME, SLAMCH, SLANSP + REAL SLAMCH, SLANSP, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANSP, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SOPMTR, SSCAL, SSPTRD, SSTEDC, SSTERF, XERBLA @@ -246,7 +240,7 @@ END IF END IF IWORK( 1 ) = LIWMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -9 @@ -325,7 +319,7 @@ IF( ISCALE.EQ.1 ) $ CALL SSCAL( N, ONE / SIGMA, W, 1 ) * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN RETURN * diff --git a/lapack-netlib/SRC/sspgv.f b/lapack-netlib/SRC/sspgv.f index c73e94e60..e8bc66e5d 100644 --- a/lapack-netlib/SRC/sspgv.f +++ b/lapack-netlib/SRC/sspgv.f @@ -139,7 +139,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero. *> > N: if INFO = n + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/sspgvd.f b/lapack-netlib/SRC/sspgvd.f index 73862ed1b..c1e14594b 100644 --- a/lapack-netlib/SRC/sspgvd.f +++ b/lapack-netlib/SRC/sspgvd.f @@ -44,12 +44,6 @@ *> positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -184,7 +178,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim @@ -197,7 +191,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHEReigen +*> \ingroup hpgvd * *> \par Contributors: * ================== @@ -231,7 +225,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SPPTRF, SSPEVD, SSPGST, STPMV, STPSV, XERBLA @@ -273,7 +268,7 @@ LWMIN = 2*N END IF END IF - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -11 @@ -351,7 +346,7 @@ END IF END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/sspgvx.f b/lapack-netlib/SRC/sspgvx.f index de581543a..6d5b4ed3d 100644 --- a/lapack-netlib/SRC/sspgvx.f +++ b/lapack-netlib/SRC/sspgvx.f @@ -245,7 +245,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/sstedc.f b/lapack-netlib/SRC/sstedc.f index 925b03422..0e1cb4258 100644 --- a/lapack-netlib/SRC/sstedc.f +++ b/lapack-netlib/SRC/sstedc.f @@ -42,12 +42,6 @@ *> found if SSYTRD or SSPTRD or SSBTRD has been used to reduce this *> matrix to tridiagonal form. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. See SLAED3 for details. *> \endverbatim * * Arguments: @@ -173,7 +167,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup auxOTHERcomputational +*> \ingroup stedc * *> \par Contributors: * ================== @@ -214,8 +208,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANST - EXTERNAL ILAENV, LSAME, SLAMCH, SLANST + REAL SLAMCH, SLANST, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SLAMCH, SLANST, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMM, SLACPY, SLAED0, SLASCL, SLASET, SLASRT, @@ -274,7 +268,7 @@ LIWMIN = 3 + 5*N END IF END IF - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT. LQUERY ) THEN @@ -469,7 +463,7 @@ END IF * 50 CONTINUE - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/sstegr.f b/lapack-netlib/SRC/sstegr.f index 2e2975fdf..2967a6969 100644 --- a/lapack-netlib/SRC/sstegr.f +++ b/lapack-netlib/SRC/sstegr.f @@ -56,7 +56,7 @@ *> *> Note : SSTEGR and SSTEMR work only on machines which follow *> IEEE-754 floating-point standard in their handling of infinities and -*> NaNs. Normal execution may create these exceptiona values and hence +*> NaNs. Normal execution may create these exceptional values and hence *> may abort due to a floating point exception in environments which *> do not conform to the IEEE-754 standard. *> \endverbatim diff --git a/lapack-netlib/SRC/sstemr.f b/lapack-netlib/SRC/sstemr.f index 3a9bbe784..62cfa3d4d 100644 --- a/lapack-netlib/SRC/sstemr.f +++ b/lapack-netlib/SRC/sstemr.f @@ -303,7 +303,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup stemr * *> \par Contributors: * ================== @@ -312,7 +312,8 @@ *> Jim Demmel, University of California, Berkeley, USA \n *> Inderjit Dhillon, University of Texas, Austin, USA \n *> Osni Marques, LBNL/NERSC, USA \n -*> Christof Voemel, University of California, Berkeley, USA +*> Christof Voemel, University of California, Berkeley, USA \n +*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n * * ===================================================================== SUBROUTINE SSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, @@ -344,7 +345,8 @@ $ MINRGP = 3.0E-3 ) * .. * .. Local Scalars .. - LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY + LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY, + $ LAESWAP INTEGER I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW, $ IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD, $ INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP, @@ -357,8 +359,8 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, SLANST - EXTERNAL LSAME, SLAMCH, SLANST + REAL SLAMCH, SLANST, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANST, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SLAE2, SLAEV2, SLARRC, SLARRE, SLARRJ, @@ -378,6 +380,7 @@ * LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) ) ZQUERY = ( NZC.EQ.-1 ) + LAESWAP = .FALSE. * SSTEMR needs WORK of size 6*N, IWORK of size 3*N. * In addition, SLARRE needs WORK of size 6*N, IWORK of size 5*N. @@ -440,7 +443,7 @@ RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * IF( WANTZ .AND. ALLEIG ) THEN @@ -500,6 +503,15 @@ ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN CALL SLAEV2( D(1), E(1), D(2), R1, R2, CS, SN ) END IF +* D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However, +* the following code requires R1 >= R2. Hence, we correct +* the order of R1, R2, CS, SN if R1 < R2 before further processing. + IF( R1.LT.R2 ) THEN + E(2) = R1 + R1 = R2 + R2 = E(2) + LAESWAP = .TRUE. + ENDIF IF( ALLEIG.OR. $ (VALEIG.AND.(R2.GT.WL).AND. $ (R2.LE.WU)).OR. @@ -507,8 +519,13 @@ M = M+1 W( M ) = R2 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = -SN - Z( 2, M ) = CS + IF( LAESWAP ) THEN + Z( 1, M ) = CS + Z( 2, M ) = SN + ELSE + Z( 1, M ) = -SN + Z( 2, M ) = CS + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN @@ -531,8 +548,13 @@ M = M+1 W( M ) = R1 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = CS - Z( 2, M ) = SN + IF( LAESWAP ) THEN + Z( 1, M ) = -SN + Z( 2, M ) = CS + ELSE + Z( 1, M ) = CS + Z( 2, M ) = SN + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN @@ -760,7 +782,7 @@ ENDIF * * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN RETURN * diff --git a/lapack-netlib/SRC/sstevd.f b/lapack-netlib/SRC/sstevd.f index bc5b5aaab..4fc2a6311 100644 --- a/lapack-netlib/SRC/sstevd.f +++ b/lapack-netlib/SRC/sstevd.f @@ -40,12 +40,6 @@ *> real symmetric tridiagonal matrix. If eigenvectors are desired, it *> uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -155,7 +149,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHEReigen +*> \ingroup stevd * * ===================================================================== SUBROUTINE SSTEVD( JOBZ, N, D, E, Z, LDZ, WORK, LWORK, IWORK, @@ -188,8 +182,8 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, SLANST - EXTERNAL LSAME, SLAMCH, SLANST + REAL SLAMCH, SLANST, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANST, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSCAL, SSTEDC, SSTERF, XERBLA @@ -221,7 +215,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -289,7 +283,7 @@ IF( ISCALE.EQ.1 ) $ CALL SSCAL( N, ONE / SIGMA, D, 1 ) * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/sstevr.f b/lapack-netlib/SRC/sstevr.f index 2ab63eb3e..42f49b11b 100644 --- a/lapack-netlib/SRC/sstevr.f +++ b/lapack-netlib/SRC/sstevr.f @@ -287,7 +287,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHEReigen +*> \ingroup stevr * *> \par Contributors: * ================== @@ -336,8 +336,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANST - EXTERNAL LSAME, ILAENV, SLAMCH, SLANST + REAL SLAMCH, SLANST, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANST, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SSCAL, SSTEBZ, SSTEMR, SSTEIN, SSTERF, @@ -389,7 +389,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -570,7 +570,7 @@ * IF (wantz .and. INDEIG ) Z( 1,1) = Z(1,1) / 1.002 + .002 * * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN RETURN * diff --git a/lapack-netlib/SRC/ssyconvf.f b/lapack-netlib/SRC/ssyconvf.f index 6defc1f0e..af55da51a 100644 --- a/lapack-netlib/SRC/ssyconvf.f +++ b/lapack-netlib/SRC/ssyconvf.f @@ -39,7 +39,7 @@ *> SSYTRF provided on entry in parameter A into the factorization *> output format used in SSYTRF_RK (or SSYTRF_BK) that is stored *> on exit in parameters A and E. It also converts in place details of -*> the intechanges stored in IPIV from the format used in SSYTRF into +*> the interchanges stored in IPIV from the format used in SSYTRF into *> the format used in SSYTRF_RK (or SSYTRF_BK). *> *> If parameter WAY = 'R': @@ -48,7 +48,7 @@ *> (or SSYTRF_BK) provided on entry in parameters A and E into *> the factorization output format used in SSYTRF that is stored *> on exit in parameter A. It also converts in place details of -*> the intechanges stored in IPIV from the format used in SSYTRF_RK +*> the interchanges stored in IPIV from the format used in SSYTRF_RK *> (or SSYTRF_BK) into the format used in SSYTRF. *> \endverbatim * @@ -322,7 +322,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -466,7 +466,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -532,7 +532,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/ssyconvf_rook.f b/lapack-netlib/SRC/ssyconvf_rook.f index c59f257bb..efd7c57fe 100644 --- a/lapack-netlib/SRC/ssyconvf_rook.f +++ b/lapack-netlib/SRC/ssyconvf_rook.f @@ -517,7 +517,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/ssyev.f b/lapack-netlib/SRC/ssyev.f index 03ed326b8..638445f04 100644 --- a/lapack-netlib/SRC/ssyev.f +++ b/lapack-netlib/SRC/ssyev.f @@ -125,7 +125,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup heev * * ===================================================================== SUBROUTINE SSYEV( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, INFO ) @@ -158,8 +158,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANSY - EXTERNAL ILAENV, LSAME, SLAMCH, SLANSY + REAL SLAMCH, SLANSY, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SLAMCH, SLANSY, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLASCL, SORGTR, SSCAL, SSTEQR, SSTERF, SSYTRD, @@ -190,7 +190,7 @@ IF( INFO.EQ.0 ) THEN NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, ( NB+2 )*N ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.MAX( 1, 3*N-1 ) .AND. .NOT.LQUERY ) $ INFO = -8 @@ -274,7 +274,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/ssyev_2stage.f b/lapack-netlib/SRC/ssyev_2stage.f index a6fa30cc8..519ee334d 100644 --- a/lapack-netlib/SRC/ssyev_2stage.f +++ b/lapack-netlib/SRC/ssyev_2stage.f @@ -20,7 +20,7 @@ * Definition: * =========== * -* SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, +* SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, * INFO ) * * IMPLICIT NONE @@ -105,12 +105,12 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 2*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + 2*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -143,7 +143,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup heev_2stage * *> \par Further Details: * ===================== @@ -161,7 +161,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -169,16 +169,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * * ===================================================================== - SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, + SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, $ INFO ) * IMPLICIT NONE @@ -211,8 +211,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - REAL SLAMCH, SLANSY - EXTERNAL LSAME, SLAMCH, SLANSY, ILAENV2STAGE + REAL SLAMCH, SLANSY, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANSY, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLASCL, SORGTR, SSCAL, SSTEQR, SSTERF, @@ -305,7 +306,7 @@ LLWORK = LWORK - INDWRK + 1 * CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ), - $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, + $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, $ WORK( INDWRK ), LLWORK, IINFO ) * * For eigenvalues only, call SSTERF. For eigenvectors, first call @@ -336,7 +337,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/ssyevd.f b/lapack-netlib/SRC/ssyevd.f index ac0d0284d..2ae44fc81 100644 --- a/lapack-netlib/SRC/ssyevd.f +++ b/lapack-netlib/SRC/ssyevd.f @@ -40,13 +40,6 @@ *> real symmetric matrix A. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. -*> *> Because of large use of BLAS of level 3, SSYEVD needs N**2 more *> workspace than SSYEVX. *> \endverbatim @@ -103,8 +96,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, -*> dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -167,7 +159,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup heevd * *> \par Contributors: * ================== @@ -211,8 +203,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANSY - EXTERNAL ILAENV, LSAME, SLAMCH, SLANSY + REAL SLAMCH, SLANSY, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SLAMCH, SLANSY, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLACPY, SLASCL, SORMTR, SSCAL, SSTEDC, SSTERF, @@ -258,7 +250,7 @@ $ N*ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) ) LIOPT = LIWMIN END IF - WORK( 1 ) = LOPT + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) IWORK( 1 ) = LIOPT * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -342,7 +334,7 @@ IF( ISCALE.EQ.1 ) $ CALL SSCAL( N, ONE / SIGMA, W, 1 ) * - WORK( 1 ) = LOPT + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) IWORK( 1 ) = LIOPT * RETURN diff --git a/lapack-netlib/SRC/ssyevd_2stage.f b/lapack-netlib/SRC/ssyevd_2stage.f index f3fde6b4a..e63e280a7 100644 --- a/lapack-netlib/SRC/ssyevd_2stage.f +++ b/lapack-netlib/SRC/ssyevd_2stage.f @@ -45,12 +45,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/ssyevr.f b/lapack-netlib/SRC/ssyevr.f index d8e4ce3ea..870facd60 100644 --- a/lapack-netlib/SRC/ssyevr.f +++ b/lapack-netlib/SRC/ssyevr.f @@ -271,7 +271,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,26*N). +*> The dimension of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 26*N. *> For optimal efficiency, LWORK >= (NB+6)*N, *> where NB is the max of the blocksize for SSYTRD and SORMTR *> returned by ILAENV. @@ -292,7 +293,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK and @@ -317,7 +319,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup heevr * *> \par Contributors: * ================== @@ -368,8 +370,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANSY - EXTERNAL LSAME, ILAENV, SLAMCH, SLANSY + REAL SLAMCH, SLANSY, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANSY, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SORMTR, SSCAL, SSTEBZ, SSTEMR, SSTEIN, @@ -392,8 +394,13 @@ * LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) ) * - LWMIN = MAX( 1, 26*N ) - LIWMIN = MAX( 1, 10*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 26*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -428,7 +435,7 @@ NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -677,7 +684,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/ssyevr_2stage.f b/lapack-netlib/SRC/ssyevr_2stage.f index 8ab2844c6..471e25977 100644 --- a/lapack-netlib/SRC/ssyevr_2stage.f +++ b/lapack-netlib/SRC/ssyevr_2stage.f @@ -263,7 +263,7 @@ *> indicating the nonzero elements in Z. The i-th eigenvector *> is nonzero only in elements ISUPPZ( 2*i-1 ) through *> ISUPPZ( 2*i ). This is an output of SSTEMR (tridiagonal -*> matrix). The support of the eigenvectors of A is typically +*> matrix). The support of the eigenvectors of A is typically *> 1:N because of the orthogonal transformations applied by SORMTR. *> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 *> \endverbatim @@ -277,12 +277,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 5*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + 5*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -300,13 +301,14 @@ *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) -*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK. +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. *> \endverbatim *> *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the IWORK array, @@ -330,7 +332,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup heevr_2stage * *> \par Contributors: * ================== @@ -358,7 +360,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -366,11 +368,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -416,8 +418,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV, ILAENV2STAGE - REAL SLAMCH, SLANSY - EXTERNAL LSAME, SLAMCH, SLANSY, ILAENV, ILAENV2STAGE + REAL SLAMCH, SLANSY, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANSY, SROUNDUP_LWORK, ILAENV, + $ ILAENV2STAGE * .. * .. External Subroutines .. EXTERNAL SCOPY, SORMTR, SSCAL, SSTEBZ, SSTEMR, SSTEIN, @@ -444,8 +447,14 @@ IB = ILAENV2STAGE( 2, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -484,7 +493,7 @@ * NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) * NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) * LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) IWORK( 1 ) = LIWMIN END IF * @@ -504,7 +513,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 26 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = A( 1, 1 ) @@ -608,7 +617,7 @@ * Call SSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form. * * - CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), + CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), $ WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ), $ LHTRD, WORK( INDWK ), LLWORK, IINFO ) * @@ -732,7 +741,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/ssyevx.f b/lapack-netlib/SRC/ssyevx.f index 11776e8c5..aaed6dad5 100644 --- a/lapack-netlib/SRC/ssyevx.f +++ b/lapack-netlib/SRC/ssyevx.f @@ -244,7 +244,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup heevx * * ===================================================================== SUBROUTINE SSYEVX( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, IL, IU, @@ -285,8 +285,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANSY - EXTERNAL LSAME, ILAENV, SLAMCH, SLANSY + REAL SLAMCH, SLANSY, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANSY, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SLACPY, SORGTR, SORMTR, SSCAL, SSTEBZ, @@ -338,14 +338,14 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWKMIN = 1 - WORK( 1 ) = LWKMIN + LWKOPT = 1 ELSE LWKMIN = 8*N NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( LWKMIN, ( NB + 3 )*N ) - WORK( 1 ) = LWKOPT END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -17 @@ -542,7 +542,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssyevx_2stage.f b/lapack-netlib/SRC/ssyevx_2stage.f index 1a2225c87..a8585e5f7 100644 --- a/lapack-netlib/SRC/ssyevx_2stage.f +++ b/lapack-netlib/SRC/ssyevx_2stage.f @@ -208,12 +208,12 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 8*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 3*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + 3*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -259,7 +259,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup heevx_2stage * *> \par Further Details: * ===================== @@ -277,7 +277,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -285,11 +285,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -334,8 +334,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - REAL SLAMCH, SLANSY - EXTERNAL LSAME, SLAMCH, SLANSY, ILAENV2STAGE + REAL SLAMCH, SLANSY, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANSY, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SLACPY, SORGTR, SORMTR, SSCAL, SSTEBZ, @@ -388,7 +389,7 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWMIN = 1 - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ELSE KD = ILAENV2STAGE( 1, 'SSYTRD_2STAGE', JOBZ, $ N, -1, -1, -1 ) @@ -487,7 +488,7 @@ INDWRK = INDHOUS + LHTRD LLWORK = LWORK - INDWRK + 1 * - CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), + CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), $ WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ), $ LHTRD, WORK( INDWRK ), LLWORK, IINFO ) * @@ -600,7 +601,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/ssygv.f b/lapack-netlib/SRC/ssygv.f index 270957fce..3a79f5431 100644 --- a/lapack-netlib/SRC/ssygv.f +++ b/lapack-netlib/SRC/ssygv.f @@ -154,7 +154,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim @@ -167,7 +167,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup hegv * * ===================================================================== SUBROUTINE SSYGV( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W, WORK, @@ -199,7 +199,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SPOTRF, SSYEV, SSYGST, STRMM, STRSM, XERBLA @@ -234,7 +235,7 @@ LWKMIN = MAX( 1, 3*N - 1 ) NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKMIN, ( NB + 2 )*N ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -11 @@ -303,7 +304,7 @@ END IF END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SSYGV diff --git a/lapack-netlib/SRC/ssygv_2stage.f b/lapack-netlib/SRC/ssygv_2stage.f index 49f357d90..8719d8c7a 100644 --- a/lapack-netlib/SRC/ssygv_2stage.f +++ b/lapack-netlib/SRC/ssygv_2stage.f @@ -143,12 +143,12 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 2*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + 2*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -173,7 +173,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim @@ -186,7 +186,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup hegv_2stage * *> \par Further Details: * ===================== @@ -204,7 +204,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -212,11 +212,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -252,7 +252,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - EXTERNAL LSAME, ILAENV2STAGE + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV2STAGE, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SPOTRF, SSYGST, STRMM, STRSM, XERBLA, @@ -359,7 +360,7 @@ END IF END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RETURN * * End of SSYGV_2STAGE diff --git a/lapack-netlib/SRC/ssygvd.f b/lapack-netlib/SRC/ssygvd.f index 7c7e0de01..a90d1afb7 100644 --- a/lapack-netlib/SRC/ssygvd.f +++ b/lapack-netlib/SRC/ssygvd.f @@ -42,12 +42,6 @@ *> B are assumed to be symmetric and B is also positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -190,7 +184,7 @@ *> the submatrix lying in rows and columns INFO/(N+1) *> through mod(INFO,N+1); *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim @@ -203,7 +197,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup hegvd * *> \par Further Details: * ===================== @@ -251,7 +245,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SPOTRF, SSYEVD, SSYGST, STRMM, STRSM, XERBLA @@ -295,7 +290,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LOPT + WORK( 1 ) = SROUNDUP_LWORK(LOPT) IWORK( 1 ) = LIOPT * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -367,7 +362,7 @@ END IF END IF * - WORK( 1 ) = LOPT + WORK( 1 ) = SROUNDUP_LWORK(LOPT) IWORK( 1 ) = LIOPT * RETURN diff --git a/lapack-netlib/SRC/ssygvx.f b/lapack-netlib/SRC/ssygvx.f index e93da60cc..16adefa22 100644 --- a/lapack-netlib/SRC/ssygvx.f +++ b/lapack-netlib/SRC/ssygvx.f @@ -270,7 +270,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim @@ -283,7 +283,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup hegvx * *> \par Contributors: * ================== @@ -324,7 +324,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SPOTRF, SSYEVX, SSYGST, STRMM, STRSM, XERBLA @@ -380,7 +381,7 @@ LWKMIN = MAX( 1, 8*N ) NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKMIN, ( NB + 3 )*N ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -20 @@ -453,7 +454,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/ssysv.f b/lapack-netlib/SRC/ssysv.f index 06a42dfb7..523ea66c1 100644 --- a/lapack-netlib/SRC/ssysv.f +++ b/lapack-netlib/SRC/ssysv.f @@ -163,7 +163,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYsolve +*> \ingroup hesv * * ===================================================================== SUBROUTINE SSYSV( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -190,7 +190,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, SSYTRF, SSYTRS, SSYTRS2 @@ -225,7 +226,7 @@ CALL SSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) LWKOPT = INT( WORK( 1 ) ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -258,7 +259,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/ssysv_aa.f b/lapack-netlib/SRC/ssysv_aa.f index 5661332c5..711a275e1 100644 --- a/lapack-netlib/SRC/ssysv_aa.f +++ b/lapack-netlib/SRC/ssysv_aa.f @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYsolve +*> \ingroup hesv_aa * * ===================================================================== SUBROUTINE SSYSV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -177,11 +177,13 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS + INTEGER LWKMIN, LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, SSYTRS_AA, SSYTRF_AA @@ -195,6 +197,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -205,18 +208,18 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_SYTRF = INT( WORK(1) ) + LWKOPT_SYTRF = INT( WORK( 1 ) ) CALL SSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_SYTRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) - WORK( 1 ) = LWKOPT + LWKOPT_SYTRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_SYTRF, LWKOPT_SYTRS ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -238,7 +241,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssysv_aa_2stage.f b/lapack-netlib/SRC/ssysv_aa_2stage.f index b9eac8342..fb068b3bf 100644 --- a/lapack-netlib/SRC/ssysv_aa_2stage.f +++ b/lapack-netlib/SRC/ssysv_aa_2stage.f @@ -88,7 +88,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> @@ -100,14 +100,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is REAL array, dimension (LTB) +*> TB is REAL array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -147,14 +147,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL workspace of size LWORK +*> WORK is REAL workspace of size (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -178,7 +179,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYsolve +*> \ingroup hesv_aa_2stage * * ===================================================================== SUBROUTINE SSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, @@ -204,11 +205,13 @@ * .. * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKMIN, LWKOPT * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSYTRF_AA_2STAGE, SSYTRS_AA_2STAGE, @@ -225,6 +228,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -233,18 +237,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -254,7 +259,6 @@ RETURN END IF * -* * Compute the factorization A = U**T*T*U or A = L*T*L**T. * CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, @@ -268,7 +272,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssysv_rk.f b/lapack-netlib/SRC/ssysv_rk.f index 9a7dfa4bb..abf862d66 100644 --- a/lapack-netlib/SRC/ssysv_rk.f +++ b/lapack-netlib/SRC/ssysv_rk.f @@ -205,7 +205,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup singleSYsolve +*> \ingroup hesv_rk * *> \par Contributors: * ================== @@ -247,7 +247,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, SSYTRF_RK, SSYTRS_3 @@ -282,7 +283,7 @@ CALL SSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) LWKOPT = INT( WORK( 1 ) ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -305,7 +306,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/ssysv_rook.f b/lapack-netlib/SRC/ssysv_rook.f index fb7ba8c53..c5c77e562 100644 --- a/lapack-netlib/SRC/ssysv_rook.f +++ b/lapack-netlib/SRC/ssysv_rook.f @@ -181,7 +181,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYsolve +*> \ingroup hesv_rook * *> \par Contributors: * ================== @@ -223,7 +223,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, SSYTRF_ROOK, SSYTRS_ROOK @@ -258,7 +259,7 @@ CALL SSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) LWKOPT = INT( WORK( 1 ) ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -281,7 +282,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/ssysvx.f b/lapack-netlib/SRC/ssysvx.f index b19ce2641..06a6413f1 100644 --- a/lapack-netlib/SRC/ssysvx.f +++ b/lapack-netlib/SRC/ssysvx.f @@ -275,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYsolve +*> \ingroup hesvx * * ===================================================================== SUBROUTINE SSYSVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B, @@ -305,14 +305,14 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKMIN, LWKOPT, NB REAL ANORM * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANSY - EXTERNAL ILAENV, LSAME, SLAMCH, SLANSY + REAL SLAMCH, SLANSY, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SLAMCH, SLANSY, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLACPY, SSYCON, SSYRFS, SSYTRF, SSYTRS, XERBLA @@ -327,6 +327,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 3*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -344,17 +345,17 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 3*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 3*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -404,7 +405,7 @@ IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) $ INFO = N + 1 * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/ssytrd.f b/lapack-netlib/SRC/ssytrd.f index f09ad9ab4..f4fbecdc9 100644 --- a/lapack-netlib/SRC/ssytrd.f +++ b/lapack-netlib/SRC/ssytrd.f @@ -139,7 +139,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetrd * *> \par Further Details: * ===================== @@ -223,7 +223,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -248,7 +249,7 @@ * NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) LWKOPT = N*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -365,7 +366,7 @@ $ TAU( I ), IINFO ) END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SSYTRD diff --git a/lapack-netlib/SRC/ssytrd_2stage.f b/lapack-netlib/SRC/ssytrd_2stage.f index 5d70ae0d4..5b401c3d0 100644 --- a/lapack-netlib/SRC/ssytrd_2stage.f +++ b/lapack-netlib/SRC/ssytrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download SSYTRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download SSYTRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * REAL A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. -* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. *> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the orthogonal *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is REAL array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is REAL array, dimension (LHOUS2) +*> HOUS2 is REAL array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -149,17 +151,19 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +181,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +206,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +214,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -265,10 +269,13 @@ * KD = ILAENV2STAGE( 1, 'SSYTRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'SSYTRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'SSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -309,14 +316,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL SSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL SSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRD_SY2SB', -INFO ) RETURN END IF - CALL SSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL SSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRD_SB2ST', -INFO ) @@ -324,8 +331,7 @@ END IF * * - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN RETURN * * End of SSYTRD_2STAGE diff --git a/lapack-netlib/SRC/ssytrd_sb2st.F b/lapack-netlib/SRC/ssytrd_sb2st.F index faeeff5f4..111eaa93e 100644 --- a/lapack-netlib/SRC/ssytrd_sb2st.F +++ b/lapack-netlib/SRC/ssytrd_sb2st.F @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, +* SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, * D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * #if defined(_OPENMP) @@ -53,12 +53,12 @@ *> \param[in] STAGE1 *> \verbatim *> STAGE1 is CHARACTER*1 -*> = 'N': "No": to mention that the stage 1 of the reduction +*> = 'N': "No": to mention that the stage 1 of the reduction *> from dense to band using the ssytrd_sy2sb routine -*> was not called before this routine to reproduce AB. -*> In other term this routine is called as standalone. -*> = 'Y': "Yes": to mention that the stage 1 of the -*> reduction from dense to band using the ssytrd_sy2sb +*> was not called before this routine to reproduce AB. +*> In other term this routine is called as standalone. +*> = 'Y': "Yes": to mention that the stage 1 of the +*> reduction from dense to band using the ssytrd_sy2sb *> routine has been called to produce AB (e.g., AB is *> the output of ssytrd_sy2sb. *> \endverbatim @@ -66,10 +66,10 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> and thus LHOUS is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate or to apply Q later on, +*> = 'V': the Householder representation is needed to +*> either generate or to apply Q later on, *> then LHOUS is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -132,34 +132,39 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is REAL array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is REAL array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. +*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension) +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error *> message related to LHOUS is issued by XERBLA. *> LHOUS = MAX(1, dimension) where *> dimension = 4*N if VECT='N' -*> not available now if VECT='H' +*> not available now if VECT='H' *> \endverbatim *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension LWORK. +*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> IF N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension) +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -188,7 +193,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup real16OTHERcomputational +*> \ingroup hetrd_hb2st * *> \par Further Details: * ===================== @@ -208,7 +213,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -216,16 +221,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, + SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, $ D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * #if defined(_OPENMP) @@ -258,11 +263,11 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, WANTQ, UPPER, AFTERS1 - INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, + INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, - $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, + $ NBTILES, TTYPE, TID, NTHREADS, + $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SISEV, SIZETAU, LDV, LHMIN, LWMIN * .. @@ -274,15 +279,15 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV2STAGE - EXTERNAL LSAME, ILAENV2STAGE + INTEGER ILAENV2STAGE + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV2STAGE, SROUNDUP_LWORK * .. * .. Executable Statements .. * * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -291,9 +296,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -314,8 +324,8 @@ END IF * IF( INFO.EQ.0 ) THEN - HOUS( 1 ) = LHMIN - WORK( 1 ) = LWMIN + HOUS( 1 ) = SROUNDUP_LWORK( LHMIN ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -355,7 +365,7 @@ ABDPOS = KD + 1 ABOFDPOS = KD ELSE - APOS = INDA + APOS = INDA AWPOS = INDA + KD + 1 DPOS = APOS OFDPOS = DPOS + 1 @@ -363,11 +373,11 @@ ABOFDPOS = 2 ENDIF -* -* Case KD=0: -* The matrix is diagonal. We just copy it (convert to "real" for -* real because D is double and the imaginary part should be 0) -* and store it in D. A sequential code here is better or +* +* Case KD=0: +* The matrix is diagonal. We just copy it (convert to "real" for +* real because D is double and the imaginary part should be 0) +* and store it in D. A sequential code here is better or * in a parallel environment it might need two cores for D and E * IF( KD.EQ.0 ) THEN @@ -382,17 +392,17 @@ WORK( 1 ) = 1 RETURN END IF -* -* Case KD=1: -* The matrix is already Tridiagonal. We have to make diagonal +* +* Case KD=1: +* The matrix is already Tridiagonal. We have to make diagonal * and offdiagonal elements real, and store them in D and E. -* For that, for real precision just copy the diag and offdiag -* to D and E while for the COMPLEX case the bulge chasing is -* performed to convert the hermetian tridiagonal to symmetric -* tridiagonal. A simpler conversion formula might be used, but then +* For that, for real precision just copy the diag and offdiag +* to D and E while for the COMPLEX case the bulge chasing is +* performed to convert the hermetian tridiagonal to symmetric +* tridiagonal. A simpler conversion formula might be used, but then * updating the Q matrix will be required and based if Q is generated -* or not this might complicate the story. -* +* or not this might complicate the story. +* IF( KD.EQ.1 ) THEN DO 50 I = 1, N D( I ) = ( AB( ABDPOS, I ) ) @@ -413,7 +423,7 @@ RETURN END IF * -* Main code start here. +* Main code start here. * Reduce the symmetric band of A to a tridiagonal matrix. * THGRSIZ = N @@ -422,7 +432,7 @@ NBTILES = CEILING( REAL(N)/REAL(KD) ) STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) ) THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) ) -* +* CALL SLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA ) CALL SLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA ) * @@ -431,7 +441,7 @@ * #if defined(_OPENMP) !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) -!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) +!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) !$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) !$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) !$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) @@ -440,7 +450,7 @@ #endif * * main bulge chasing loop -* +* DO 100 THGRID = 1, THGRNB STT = (THGRID-1)*THGRSIZ+1 THED = MIN( (STT + THGRSIZ -1), (N-1)) @@ -451,7 +461,7 @@ ST = STT DO 130 SWEEPID = ST, ED DO 140 K = 1, GRSIZ - MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) + MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) $ + (M-1)*GRSIZ + K IF ( MYID.EQ.1 ) THEN TTYPE = 1 @@ -477,16 +487,16 @@ ENDIF * * Call the kernel -* +* #if defined(_OPENMP) && _OPENMP >= 201307 - IF( TTYPE.NE.1 ) THEN + IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK @@ -494,20 +504,20 @@ !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK ENDIF #else - CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, - $ WORK( INDW + TID*KD ) ) -#endif + $ WORK( INDW ) ) +#endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 EXIT @@ -522,14 +532,14 @@ !$OMP END MASTER !$OMP END PARALLEL #endif -* +* * Copy the diagonal from A to D. Note that D is REAL thus only * the Real part is needed, the imaginary part should be zero. * DO 150 I = 1, N D( I ) = ( WORK( DPOS+(I-1)*LDA ) ) 150 CONTINUE -* +* * Copy the off diagonal from A to E. Note that E is REAL thus only * the Real part is needed, the imaginary part should be zero. * @@ -543,11 +553,10 @@ 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SSYTRD_SB2ST * END - + diff --git a/lapack-netlib/SRC/ssytrd_sy2sb.f b/lapack-netlib/SRC/ssytrd_sy2sb.f index 2c92cd14a..3996e07bb 100644 --- a/lapack-netlib/SRC/ssytrd_sy2sb.f +++ b/lapack-netlib/SRC/ssytrd_sy2sb.f @@ -124,7 +124,7 @@ *> \param[out] WORK *> \verbatim *> WORK is REAL array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. *> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY) +*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -158,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetrd_he2hb * *> \par Further Details: * ===================== @@ -283,7 +285,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - EXTERNAL LSAME, ILAENV2STAGE + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV2STAGE, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -293,8 +296,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -313,7 +320,7 @@ CALL XERBLA( 'SSYTRD_SY2SB', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN END IF * @@ -506,7 +513,7 @@ END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SSYTRD_SY2SB diff --git a/lapack-netlib/SRC/ssytrf.f b/lapack-netlib/SRC/ssytrf.f index d188589b9..55f3a4f0f 100644 --- a/lapack-netlib/SRC/ssytrf.f +++ b/lapack-netlib/SRC/ssytrf.f @@ -135,7 +135,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetrf * *> \par Further Details: * ===================== @@ -202,7 +202,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLASYF, SSYTF2, XERBLA @@ -232,8 +233,8 @@ * Determine the block size * NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = LWKOPT + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -352,7 +353,8 @@ END IF * 40 CONTINUE - WORK( 1 ) = LWKOPT +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF diff --git a/lapack-netlib/SRC/ssytrf_aa.f b/lapack-netlib/SRC/ssytrf_aa.f index 13e849cdc..af32fb064 100644 --- a/lapack-netlib/SRC/ssytrf_aa.f +++ b/lapack-netlib/SRC/ssytrf_aa.f @@ -74,7 +74,7 @@ *> *> On exit, the tridiagonal matrix is stored in the diagonals *> and the subdiagonals of A just below (or above) the diagonals, -*> and L is stored below (or above) the subdiaonals, when UPLO +*> and L is stored below (or above) the subdiagonals, when UPLO *> is 'L' (or 'U'). *> \endverbatim *> @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= MAX(1,2*N). For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -125,10 +127,10 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -142,24 +144,25 @@ * .. * .. Array Arguments .. INTEGER IPIV( * ) - REAL A( LDA, * ), WORK( * ) + REAL A( LDA, * ), WORK( * ) * .. * * ===================================================================== * .. Parameters .. - REAL ZERO, ONE + REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB - REAL ALPHA + REAL ALPHA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLASYF_AA, SGEMV, SSCAL, SCOPY, SSWAP, SGEMM, @@ -179,19 +182,26 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -203,11 +213,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN RETURN END IF * @@ -457,7 +467,8 @@ END IF * 20 CONTINUE - WORK( 1 ) = LWKOPT +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF_AA diff --git a/lapack-netlib/SRC/ssytrf_aa_2stage.f b/lapack-netlib/SRC/ssytrf_aa_2stage.f index 94b3aa4b5..6b5cdee1b 100644 --- a/lapack-netlib/SRC/ssytrf_aa_2stage.f +++ b/lapack-netlib/SRC/ssytrf_aa_2stage.f @@ -75,7 +75,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> @@ -94,7 +94,7 @@ *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL workspace of size LWORK +*> WORK is REAL workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -187,7 +187,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, SCOPY, SLACPY, @@ -211,9 +212,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -227,10 +228,10 @@ NB = ILAENV( 1, 'SSYTRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = SROUNDUP_LWORK( MAX( 1, (3*NB+1)*N ) ) END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = SROUNDUP_LWORK( MAX( 1, N*NB ) ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -239,7 +240,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * diff --git a/lapack-netlib/SRC/ssytrf_rk.f b/lapack-netlib/SRC/ssytrf_rk.f index ec84fcb1b..89ecf38fd 100644 --- a/lapack-netlib/SRC/ssytrf_rk.f +++ b/lapack-netlib/SRC/ssytrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension ( MAX(1,LWORK) ). +*> WORK is REAL array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. *> @@ -229,7 +229,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup singleSYcomputational +*> \ingroup hetrf_rk * *> \par Further Details: * ===================== @@ -280,7 +280,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLASYF_RK, SSYTF2_RK, SSWAP, XERBLA @@ -310,8 +311,8 @@ * Determine the block size * NB = ILAENV( 1, 'SSYTRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = LWKOPT + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -487,7 +488,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF_RK diff --git a/lapack-netlib/SRC/ssytrf_rook.f b/lapack-netlib/SRC/ssytrf_rook.f index 653289e2b..7c2cbbc57 100644 --- a/lapack-netlib/SRC/ssytrf_rook.f +++ b/lapack-netlib/SRC/ssytrf_rook.f @@ -118,7 +118,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -146,7 +146,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetrf_rook * *> \par Further Details: * ===================== @@ -228,7 +228,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLASYF_ROOK, SSYTF2_ROOK, XERBLA @@ -259,7 +260,7 @@ * NB = ILAENV( 1, 'SSYTRF_ROOK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -382,7 +383,8 @@ END IF * 40 CONTINUE - WORK( 1 ) = LWKOPT +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF_ROOK diff --git a/lapack-netlib/SRC/ssytri2.f b/lapack-netlib/SRC/ssytri2.f index 49f6cad65..fd1c53473 100644 --- a/lapack-netlib/SRC/ssytri2.f +++ b/lapack-netlib/SRC/ssytri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (N+NB+1)*(NB+3) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE SSYTRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -147,7 +147,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSYTRI, SSYTRI2X, XERBLA @@ -159,9 +160,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +178,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = SROUNDUP_LWORK( MINSIZE ) RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - - IF( NBMAX .GE. N ) THEN +* + IF( NBMAX.GE.N ) THEN CALL SSYTRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL SSYTRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of SSYTRI2 diff --git a/lapack-netlib/SRC/ssytri_3.f b/lapack-netlib/SRC/ssytri_3.f index 58d5df92a..f0152a149 100644 --- a/lapack-netlib/SRC/ssytri_3.f +++ b/lapack-netlib/SRC/ssytri_3.f @@ -119,16 +119,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (N+NB+1)*(NB+3). +*> WORK is REAL array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= (N+NB+1)*(NB+3). +*> The length of WORK. +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> -*> If LDWORK = -1, then a workspace query is assumed; +*> If LWORK = -1, then a workspace query is assumed; *> the routine only calculates the optimal size of the optimal *> size of the WORK array, returns this value as the first *> entry of the WORK array, and no error message related to @@ -152,7 +153,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup singleSYcomputational +*> \ingroup hetri_3 * *> \par Contributors: * ================== @@ -190,7 +191,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSYTRI_3X, XERBLA @@ -208,8 +210,13 @@ * * Determine the block size * - NB = MAX( 1, ILAENV( 1, 'SSYTRI_3', UPLO, N, -1, -1, -1 ) ) - LWKOPT = ( N+NB+1 ) * ( NB+3 ) + IF( N.EQ.0 ) THEN + LWKOPT = 1 + ELSE + NB = MAX( 1, ILAENV( 1, 'SSYTRI_3', UPLO, N, -1, -1, -1 ) ) + LWKOPT = ( N+NB+1 ) * ( NB+3 ) + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 @@ -217,7 +224,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN INFO = -8 END IF * @@ -225,7 +232,6 @@ CALL XERBLA( 'SSYTRI_3', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = LWKOPT RETURN END IF * @@ -236,7 +242,7 @@ * CALL SSYTRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO ) * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssytrs_aa.f b/lapack-netlib/SRC/ssytrs_aa.f index 3cfa2a206..265cf0c1d 100644 --- a/lapack-netlib/SRC/ssytrs_aa.f +++ b/lapack-netlib/SRC/ssytrs_aa.f @@ -105,7 +105,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -123,7 +129,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetrs_aa * * ===================================================================== SUBROUTINE SSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, @@ -141,7 +147,7 @@ * .. * .. Array Arguments .. INTEGER IPIV( * ) - REAL A( LDA, * ), B( LDB, * ), WORK( * ) + REAL A( LDA, * ), B( LDB, * ), WORK( * ) * .. * * ===================================================================== @@ -151,23 +157,31 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGTSV, SSWAP, SLACPY, STRSM, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -178,21 +192,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK( LWKMIN ) RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN diff --git a/lapack-netlib/SRC/stgevc.f b/lapack-netlib/SRC/stgevc.f index 15fc88c4b..be4cb1829 100644 --- a/lapack-netlib/SRC/stgevc.f +++ b/lapack-netlib/SRC/stgevc.f @@ -52,7 +52,7 @@ *> *> S*x = w*P*x, (y**H)*S = w*(y**H)*P, *> -*> where y**H denotes the conjugate tranpose of y. +*> where y**H denotes the conjugate transpose of y. *> The eigenvalues are not input to this routine, but are computed *> directly from the diagonal blocks of S and P. *> @@ -337,7 +337,7 @@ EXTERNAL LSAME, SLAMCH * .. * .. External Subroutines .. - EXTERNAL SGEMV, SLABAD, SLACPY, SLAG2, SLALN2, XERBLA + EXTERNAL SGEMV, SLACPY, SLAG2, SLALN2, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN @@ -463,7 +463,6 @@ * SAFMIN = SLAMCH( 'Safe minimum' ) BIG = ONE / SAFMIN - CALL SLABAD( SAFMIN, BIG ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) SMALL = SAFMIN*N / ULP BIG = ONE / SMALL diff --git a/lapack-netlib/SRC/stgex2.f b/lapack-netlib/SRC/stgex2.f index d61da2327..885739ab5 100644 --- a/lapack-netlib/SRC/stgex2.f +++ b/lapack-netlib/SRC/stgex2.f @@ -255,7 +255,7 @@ $ THRESHA, THRESHB * .. * .. Local Arrays .. - INTEGER IWORK( LDST ) + INTEGER IWORK( LDST + 2 ) REAL AI( 2 ), AR( 2 ), BE( 2 ), IR( LDST, LDST ), $ IRCOP( LDST, LDST ), LI( LDST, LDST ), $ LICOP( LDST, LDST ), S( LDST, LDST ), diff --git a/lapack-netlib/SRC/stgexc.f b/lapack-netlib/SRC/stgexc.f index d1ad79936..d68eb5fc7 100644 --- a/lapack-netlib/SRC/stgexc.f +++ b/lapack-netlib/SRC/stgexc.f @@ -195,7 +195,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup tgexc * *> \par Contributors: * ================== @@ -241,6 +241,10 @@ LOGICAL LQUERY INTEGER HERE, LWMIN, NBF, NBL, NBNEXT * .. +* .. External Functions .. + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL STGEX2, XERBLA * .. @@ -533,7 +537,7 @@ $ GO TO 20 END IF ILST = HERE - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RETURN * * End of STGEXC diff --git a/lapack-netlib/SRC/stgsen.f b/lapack-netlib/SRC/stgsen.f index f1103d740..ac9c4677a 100644 --- a/lapack-netlib/SRC/stgsen.f +++ b/lapack-netlib/SRC/stgsen.f @@ -304,7 +304,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup tgsen * *> \par Further Details: * ===================== @@ -490,8 +490,8 @@ $ XERBLA * .. * .. External Functions .. - REAL SLAMCH - EXTERNAL SLAMCH + REAL SLAMCH, SROUNDUP_LWORK + EXTERNAL SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SIGN, SQRT @@ -571,7 +571,7 @@ LIWMIN = 1 END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -852,7 +852,7 @@ END IF 70 CONTINUE * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/stgsna.f b/lapack-netlib/SRC/stgsna.f index 40f822915..e8cb28b95 100644 --- a/lapack-netlib/SRC/stgsna.f +++ b/lapack-netlib/SRC/stgsna.f @@ -230,7 +230,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup tgsna * *> \par Further Details: * ===================== @@ -416,8 +416,9 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SDOT, SLAMCH, SLAPY2, SNRM2 - EXTERNAL LSAME, SDOT, SLAMCH, SLAPY2, SNRM2 + REAL SDOT, SLAMCH, SLAPY2, SNRM2, SROUNDUP_LWORK + EXTERNAL LSAME, SDOT, SLAMCH, SLAPY2, SNRM2, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMV, SLACPY, SLAG2, STGEXC, STGSYL, XERBLA @@ -490,7 +491,7 @@ ELSE LWMIN = N END IF - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * IF( MM.LT.M ) THEN INFO = -15 @@ -632,8 +633,8 @@ C1 = TWO*( ALPHAR*ALPHAR+ALPHAI*ALPHAI+BETA*BETA ) C2 = FOUR*BETA*BETA*ALPHAI*ALPHAI ROOT1 = C1 + SQRT( C1*C1-4.0*C2 ) - ROOT2 = C2 / ROOT1 ROOT1 = ROOT1 / TWO + ROOT2 = C2 / ROOT1 COND = MIN( SQRT( ROOT1 ), SQRT( ROOT2 ) ) END IF * @@ -689,7 +690,7 @@ $ KS = KS + 1 * 20 CONTINUE - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RETURN * * End of STGSNA diff --git a/lapack-netlib/SRC/stgsyl.f b/lapack-netlib/SRC/stgsyl.f index 733c8ab9c..07a82e380 100644 --- a/lapack-netlib/SRC/stgsyl.f +++ b/lapack-netlib/SRC/stgsyl.f @@ -261,7 +261,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup tgsyl * *> \par Contributors: * ================== @@ -331,7 +331,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMM, SLACPY, SLASET, SSCAL, STGSY2, XERBLA @@ -384,7 +385,7 @@ ELSE LWMIN = 1 END IF - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -20 @@ -670,7 +671,7 @@ * END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/strevc3.f b/lapack-netlib/SRC/strevc3.f index 5af57123b..253cbc24c 100644 --- a/lapack-netlib/SRC/strevc3.f +++ b/lapack-netlib/SRC/strevc3.f @@ -298,7 +298,7 @@ * INFO = 0 NB = ILAENV( 1, 'STREVC', SIDE // HOWMNY, N, -1, -1, -1 ) - MAXWRK = N + 2*N*NB + MAXWRK = MAX( 1, N + 2*N*NB ) WORK(1) = MAXWRK LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN diff --git a/lapack-netlib/SRC/strsen.f b/lapack-netlib/SRC/strsen.f index c0f75fb12..f7a05ae8b 100644 --- a/lapack-netlib/SRC/strsen.f +++ b/lapack-netlib/SRC/strsen.f @@ -231,7 +231,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup trsen * *> \par Further Details: * ===================== @@ -346,8 +346,8 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SLANGE - EXTERNAL LSAME, SLANGE + REAL SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, SLANGE, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLACN2, SLACPY, STREXC, STRSYL, XERBLA @@ -427,7 +427,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN END IF * @@ -558,7 +558,7 @@ END IF 60 CONTINUE * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/strsyl3.f b/lapack-netlib/SRC/strsyl3.f index 28762c2ed..ef3f2da83 100644 --- a/lapack-netlib/SRC/strsyl3.f +++ b/lapack-netlib/SRC/strsyl3.f @@ -1223,7 +1223,7 @@ * SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) BUF = BUF * SCALOC - CALL SLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + CALL SLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK(1) ) END IF * * Combine with buffer scaling factor. SCALE will be flushed if diff --git a/lapack-netlib/SRC/stzrzf.f b/lapack-netlib/SRC/stzrzf.f index e8cbb56b6..516bea5d4 100644 --- a/lapack-netlib/SRC/stzrzf.f +++ b/lapack-netlib/SRC/stzrzf.f @@ -116,7 +116,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup tzrzf * *> \par Contributors: * ================== @@ -179,7 +179,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -207,7 +208,7 @@ LWKOPT = M*NB LWKMIN = MAX( 1, M ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 @@ -301,7 +302,7 @@ IF( MU.GT.0 ) $ CALL SLATRZ( MU, N, N-M, A, LDA, TAU, WORK ) * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/zbbcsd.f b/lapack-netlib/SRC/zbbcsd.f index 757e98c71..6601f4a06 100644 --- a/lapack-netlib/SRC/zbbcsd.f +++ b/lapack-netlib/SRC/zbbcsd.f @@ -804,7 +804,7 @@ CALL DLARTGP( B22BULGE, B22E(I-1), RWORK(IU2SN+I-1), $ RWORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL DLARTGS( B21E(I), B21E(I+1), NU, RWORK(IU2CS+I-1), + CALL DLARTGS( B21E(I), B21D(I+1), NU, RWORK(IU2CS+I-1), $ RWORK(IU2SN+I-1) ) ELSE CALL DLARTGS( B22D(I), B22E(I), MU, RWORK(IU2CS+I-1), diff --git a/lapack-netlib/SRC/zbdsqr.f b/lapack-netlib/SRC/zbdsqr.f index 2ad6142a7..865bb9dd5 100644 --- a/lapack-netlib/SRC/zbdsqr.f +++ b/lapack-netlib/SRC/zbdsqr.f @@ -204,6 +204,17 @@ *> algorithm through its inner loop. The algorithms stops *> (and so fails to converge) if the number of passes *> through the inner loop exceeds MAXITR*N**2. +*> +*> \endverbatim +* +*> \par Note: +* =========== +*> +*> \verbatim +*> Bug report from Cezary Dendek. +*> On November 3rd 2023, the INTEGER variable MAXIT = MAXITR*N**2 is +*> removed since it can overflow pretty easily (for N larger or equal +*> than 18,919). We instead use MAXITDIVN = MAXITR*N. *> \endverbatim * * Authors: @@ -214,7 +225,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup bdsqr * * ===================================================================== SUBROUTINE ZBDSQR( UPLO, N, NCVT, NRU, NCC, D, E, VT, LDVT, U, @@ -255,11 +266,11 @@ * .. * .. Local Scalars .. LOGICAL LOWER, ROTATE - INTEGER I, IDIR, ISUB, ITER, J, LL, LLL, M, MAXIT, NM1, - $ NM12, NM13, OLDLL, OLDM + INTEGER I, IDIR, ISUB, ITER, ITERDIVN, J, LL, LLL, M, + $ MAXITDIVN, NM1, NM12, NM13, OLDLL, OLDM DOUBLE PRECISION ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU, $ OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL, - $ SINR, SLL, SMAX, SMIN, SMINL, SMINOA, + $ SINR, SLL, SMAX, SMIN, SMINOA, $ SN, THRESH, TOL, TOLMUL, UNFL * .. * .. External Functions .. @@ -372,7 +383,7 @@ DO 30 I = 1, N - 1 SMAX = MAX( SMAX, ABS( E( I ) ) ) 30 CONTINUE - SMINL = ZERO + SMIN = ZERO IF( TOL.GE.ZERO ) THEN * * Relative accuracy desired @@ -389,20 +400,21 @@ 40 CONTINUE 50 CONTINUE SMINOA = SMINOA / SQRT( DBLE( N ) ) - THRESH = MAX( TOL*SMINOA, MAXITR*N*N*UNFL ) + THRESH = MAX( TOL*SMINOA, MAXITR*(N*(N*UNFL)) ) ELSE * * Absolute accuracy desired * - THRESH = MAX( ABS( TOL )*SMAX, MAXITR*N*N*UNFL ) + THRESH = MAX( ABS( TOL )*SMAX, MAXITR*(N*(N*UNFL)) ) END IF * * Prepare for main iteration loop for the singular values * (MAXIT is the maximum number of passes through the inner * loop permitted before nonconvergence signalled.) * - MAXIT = MAXITR*N*N - ITER = 0 + MAXITDIVN = MAXITR*N + ITERDIVN = 0 + ITER = -1 OLDLL = -1 OLDM = -1 * @@ -418,15 +430,18 @@ * IF( M.LE.1 ) $ GO TO 160 - IF( ITER.GT.MAXIT ) - $ GO TO 200 + IF( ITER.GE.N ) THEN + ITER = ITER - N + ITERDIVN = ITERDIVN + 1 + IF( ITERDIVN.GE.MAXITDIVN ) + $ GO TO 200 + END IF * * Find diagonal block of matrix to work on * IF( TOL.LT.ZERO .AND. ABS( D( M ) ).LE.THRESH ) $ D( M ) = ZERO SMAX = ABS( D( M ) ) - SMIN = SMAX DO 70 LLL = 1, M - 1 LL = M - LLL ABSS = ABS( D( LL ) ) @@ -435,7 +450,6 @@ $ D( LL ) = ZERO IF( ABSE.LE.THRESH ) $ GO TO 80 - SMIN = MIN( SMIN, ABSS ) SMAX = MAX( SMAX, ABSS, ABSE ) 70 CONTINUE LL = 0 @@ -517,14 +531,14 @@ * apply convergence criterion forward * MU = ABS( D( LL ) ) - SMINL = MU + SMIN = MU DO 100 LLL = LL, M - 1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL+1 ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 100 CONTINUE END IF * @@ -545,14 +559,14 @@ * apply convergence criterion backward * MU = ABS( D( M ) ) - SMINL = MU + SMIN = MU DO 110 LLL = M - 1, LL, -1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 110 CONTINUE END IF END IF @@ -562,7 +576,7 @@ * Compute shift. First, test if shifting would ruin relative * accuracy, and if so set the shift to zero. * - IF( TOL.GE.ZERO .AND. N*TOL*( SMINL / SMAX ).LE. + IF( TOL.GE.ZERO .AND. N*TOL*( SMIN / SMAX ).LE. $ MAX( EPS, HNDRTH*TOL ) ) THEN * * Use a zero shift to avoid loss of relative accuracy diff --git a/lapack-netlib/SRC/zcposv.f b/lapack-netlib/SRC/zcposv.f index 9b1940d60..649d2c049 100644 --- a/lapack-netlib/SRC/zcposv.f +++ b/lapack-netlib/SRC/zcposv.f @@ -187,10 +187,10 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of -*> (COMPLEX*16) A is not positive definite, so the -*> factorization could not be completed, and the solution -*> has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of (COMPLEX*16) A is not positive, so the factorization +*> could not be completed, and the solution has not been +*> computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zgebal.f b/lapack-netlib/SRC/zgebal.f index d4a9e39f1..a467991d4 100644 --- a/lapack-netlib/SRC/zgebal.f +++ b/lapack-netlib/SRC/zgebal.f @@ -89,7 +89,7 @@ *> \param[out] IHI *> \verbatim *> IHI is INTEGER -*> ILO and IHI are set to INTEGER such that on exit +*> ILO and IHI are set to integers such that on exit *> A(i,j) = 0 if i > j and j = 1,...,ILO-1 or I = IHI+1,...,N. *> If JOB = 'N' or 'S', ILO = 1 and IHI = N. *> \endverbatim @@ -155,6 +155,9 @@ *> *> Modified by Tzu-Yi Chen, Computer Science Division, University of *> California at Berkeley, USA +*> +*> Refactored by Evert Provoost, Department of Computer Science, +*> KU Leuven, Belgium *> \endverbatim *> * ===================================================================== @@ -184,8 +187,8 @@ PARAMETER ( FACTOR = 0.95D+0 ) * .. * .. Local Scalars .. - LOGICAL NOCONV - INTEGER I, ICA, IEXC, IRA, J, K, L, M + LOGICAL NOCONV, CANSWAP + INTEGER I, ICA, IRA, J, K, L DOUBLE PRECISION C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1, $ SFMIN2 * .. @@ -217,176 +220,194 @@ RETURN END IF * - K = 1 - L = N +* Quick returns. * - IF( N.EQ.0 ) - $ GO TO 210 + IF( N.EQ.0 ) THEN + ILO = 1 + IHI = 0 + RETURN + END IF * IF( LSAME( JOB, 'N' ) ) THEN - DO 10 I = 1, N + DO I = 1, N SCALE( I ) = ONE - 10 CONTINUE - GO TO 210 + END DO + ILO = 1 + IHI = N + RETURN END IF * - IF( LSAME( JOB, 'S' ) ) - $ GO TO 120 -* -* Permutation to isolate eigenvalues if possible -* - GO TO 50 -* -* Row and column exchange. -* - 20 CONTINUE - SCALE( M ) = J - IF( J.EQ.M ) - $ GO TO 30 -* - CALL ZSWAP( L, A( 1, J ), 1, A( 1, M ), 1 ) - CALL ZSWAP( N-K+1, A( J, K ), LDA, A( M, K ), LDA ) -* - 30 CONTINUE - GO TO ( 40, 80 )IEXC -* -* Search for rows isolating an eigenvalue and push them down. -* - 40 CONTINUE - IF( L.EQ.1 ) - $ GO TO 210 - L = L - 1 -* - 50 CONTINUE - DO 70 J = L, 1, -1 +* Permutation to isolate eigenvalues if possible. * - DO 60 I = 1, L - IF( I.EQ.J ) - $ GO TO 60 - IF( DBLE( A( J, I ) ).NE.ZERO .OR. DIMAG( A( J, I ) ).NE. - $ ZERO )GO TO 70 - 60 CONTINUE -* - M = L - IEXC = 1 - GO TO 20 - 70 CONTINUE -* - GO TO 90 + K = 1 + L = N * -* Search for columns isolating an eigenvalue and push them left. + IF( .NOT.LSAME( JOB, 'S' ) ) THEN * - 80 CONTINUE - K = K + 1 +* Row and column exchange. * - 90 CONTINUE - DO 110 J = K, L + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for rows isolating an eigenvalue and push them down. +* + NOCONV = .FALSE. + DO I = L, 1, -1 + CANSWAP = .TRUE. + DO J = 1, L + IF( I.NE.J .AND. ( DBLE( A( I, J ) ).NE.ZERO .OR. + $ DIMAG( A( I, J ) ).NE.ZERO ) ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( L ) = I + IF( I.NE.L ) THEN + CALL ZSWAP( L, A( 1, I ), 1, A( 1, L ), 1 ) + CALL ZSWAP( N-K+1, A( I, K ), LDA, A( L, K ), LDA ) + END IF + NOCONV = .TRUE. +* + IF( L.EQ.1 ) THEN + ILO = 1 + IHI = 1 + RETURN + END IF +* + L = L - 1 + END IF + END DO +* + END DO + + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for columns isolating an eigenvalue and push them left. +* + NOCONV = .FALSE. + DO J = K, L + CANSWAP = .TRUE. + DO I = K, L + IF( I.NE.J .AND. ( DBLE( A( I, J ) ).NE.ZERO .OR. + $ DIMAG( A( I, J ) ).NE.ZERO ) ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( K ) = J + IF( J.NE.K ) THEN + CALL ZSWAP( L, A( 1, J ), 1, A( 1, K ), 1 ) + CALL ZSWAP( N-K+1, A( J, K ), LDA, A( K, K ), LDA ) + END IF + NOCONV = .TRUE. +* + K = K + 1 + END IF + END DO +* + END DO * - DO 100 I = K, L - IF( I.EQ.J ) - $ GO TO 100 - IF( DBLE( A( I, J ) ).NE.ZERO .OR. DIMAG( A( I, J ) ).NE. - $ ZERO )GO TO 110 - 100 CONTINUE + END IF * - M = K - IEXC = 2 - GO TO 20 - 110 CONTINUE +* Initialize SCALE for non-permuted submatrix. * - 120 CONTINUE - DO 130 I = K, L + DO I = K, L SCALE( I ) = ONE - 130 CONTINUE + END DO * - IF( LSAME( JOB, 'P' ) ) - $ GO TO 210 +* If we only had to permute, we are done. +* + IF( LSAME( JOB, 'P' ) ) THEN + ILO = K + IHI = L + RETURN + END IF * * Balance the submatrix in rows K to L. * -* Iterative loop for norm reduction +* Iterative loop for norm reduction. * SFMIN1 = DLAMCH( 'S' ) / DLAMCH( 'P' ) SFMAX1 = ONE / SFMIN1 SFMIN2 = SFMIN1*SCLFAC SFMAX2 = ONE / SFMIN2 - 140 CONTINUE - NOCONV = .FALSE. -* - DO 200 I = K, L -* - C = DZNRM2( L-K+1, A( K, I ), 1 ) - R = DZNRM2( L-K+1, A( I, K ), LDA ) - ICA = IZAMAX( L, A( 1, I ), 1 ) - CA = ABS( A( ICA, I ) ) - IRA = IZAMAX( N-K+1, A( I, K ), LDA ) - RA = ABS( A( I, IRA+K-1 ) ) -* -* Guard against zero C or R due to underflow. -* - IF( C.EQ.ZERO .OR. R.EQ.ZERO ) - $ GO TO 200 - G = R / SCLFAC - F = ONE - S = C + R - 160 CONTINUE - IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR. - $ MIN( R, G, RA ).LE.SFMIN2 )GO TO 170 - IF( DISNAN( C+F+CA+R+G+RA ) ) THEN * -* Exit if NaN to avoid infinite loop + NOCONV = .TRUE. + DO WHILE( NOCONV ) + NOCONV = .FALSE. * - INFO = -3 - CALL XERBLA( 'ZGEBAL', -INFO ) - RETURN - END IF - F = F*SCLFAC - C = C*SCLFAC - CA = CA*SCLFAC - R = R / SCLFAC - G = G / SCLFAC - RA = RA / SCLFAC - GO TO 160 -* - 170 CONTINUE - G = C / SCLFAC - 180 CONTINUE - IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR. - $ MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190 - F = F / SCLFAC - C = C / SCLFAC - G = G / SCLFAC - CA = CA / SCLFAC - R = R*SCLFAC - RA = RA*SCLFAC - GO TO 180 -* -* Now balance. -* - 190 CONTINUE - IF( ( C+R ).GE.FACTOR*S ) - $ GO TO 200 - IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN - IF( F*SCALE( I ).LE.SFMIN1 ) - $ GO TO 200 - END IF - IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN - IF( SCALE( I ).GE.SFMAX1 / F ) - $ GO TO 200 - END IF - G = ONE / F - SCALE( I ) = SCALE( I )*F - NOCONV = .TRUE. + DO I = K, L * - CALL ZDSCAL( N-K+1, G, A( I, K ), LDA ) - CALL ZDSCAL( L, F, A( 1, I ), 1 ) + C = DZNRM2( L-K+1, A( K, I ), 1 ) + R = DZNRM2( L-K+1, A( I, K ), LDA ) + ICA = IZAMAX( L, A( 1, I ), 1 ) + CA = ABS( A( ICA, I ) ) + IRA = IZAMAX( N-K+1, A( I, K ), LDA ) + RA = ABS( A( I, IRA+K-1 ) ) * - 200 CONTINUE +* Guard against zero C or R due to underflow. +* + IF( C.EQ.ZERO .OR. R.EQ.ZERO ) CYCLE +* +* Exit if NaN to avoid infinite loop * - IF( NOCONV ) - $ GO TO 140 + IF( DISNAN( C+CA+R+RA ) ) THEN + INFO = -3 + CALL XERBLA( 'ZGEBAL', -INFO ) + RETURN + END IF +* + G = R / SCLFAC + F = ONE + S = C + R +* + DO WHILE( C.LT.G .AND. MAX( F, C, CA ).LT.SFMAX2 .AND. + $ MIN( R, G, RA ).GT.SFMIN2 ) + F = F*SCLFAC + C = C*SCLFAC + CA = CA*SCLFAC + R = R / SCLFAC + G = G / SCLFAC + RA = RA / SCLFAC + END DO +* + G = C / SCLFAC +* + DO WHILE( G.GE.R .AND. MAX( R, RA ).LT.SFMAX2 .AND. + $ MIN( F, C, G, CA ).GT.SFMIN2 ) + F = F / SCLFAC + C = C / SCLFAC + G = G / SCLFAC + CA = CA / SCLFAC + R = R*SCLFAC + RA = RA*SCLFAC + END DO +* +* Now balance. +* + IF( ( C+R ).GE.FACTOR*S ) CYCLE + IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN + IF( F*SCALE( I ).LE.SFMIN1 ) CYCLE + END IF + IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN + IF( SCALE( I ).GE.SFMAX1 / F ) CYCLE + END IF + G = ONE / F + SCALE( I ) = SCALE( I )*F + NOCONV = .TRUE. +* + CALL ZDSCAL( N-K+1, G, A( I, K ), LDA ) + CALL ZDSCAL( L, F, A( 1, I ), 1 ) +* + END DO +* + END DO * - 210 CONTINUE ILO = K IHI = L * diff --git a/lapack-netlib/SRC/zgebrd.f b/lapack-netlib/SRC/zgebrd.f index f1791c6a4..c1a6169a7 100644 --- a/lapack-netlib/SRC/zgebrd.f +++ b/lapack-netlib/SRC/zgebrd.f @@ -122,7 +122,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -147,7 +148,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gebrd * *> \par Further Details: * ===================== @@ -223,8 +224,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. EXTERNAL XERBLA, ZGEBD2, ZGEMM, ZLABRD @@ -241,9 +242,17 @@ * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'ZGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'ZGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + END IF WORK( 1 ) = DBLE( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -251,7 +260,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -263,7 +272,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -282,7 +290,7 @@ * Determine when to switch from blocked to unblocked code. * IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using diff --git a/lapack-netlib/SRC/zgecon.f b/lapack-netlib/SRC/zgecon.f index 9cbfe35bc..ef567d7c2 100644 --- a/lapack-netlib/SRC/zgecon.f +++ b/lapack-netlib/SRC/zgecon.f @@ -105,8 +105,15 @@ *> \verbatim *> INFO is INTEGER *> = 0: successful exit -*> < 0: if INFO = -i, the i-th argument had an illegal value -*> =-5: if ANORM is NAN or negative. +*> < 0: if INFO = -i, the i-th argument had an illegal value. +*> NaNs are illegal values for ANORM, and they propagate to +*> the output parameter RCOND. +*> Infinity is illegal for ANORM, and it propagates to the output +*> parameter RCOND as 0. +*> = 1: if RCOND = NaN, or +*> RCOND = Inf, or +*> the computed norm of the inverse of A is 0. +*> In the latter, RCOND = 0 is returned. *> \endverbatim * * Authors: @@ -117,7 +124,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gecon * * ===================================================================== SUBROUTINE ZGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, RWORK, @@ -147,7 +154,7 @@ LOGICAL ONENRM CHARACTER NORMIN INTEGER IX, KASE, KASE1 - DOUBLE PRECISION AINVNM, SCALE, SL, SMLNUM, SU + DOUBLE PRECISION AINVNM, SCALE, SL, SMLNUM, SU, HUGEVAL COMPLEX*16 ZDUM * .. * .. Local Arrays .. @@ -172,6 +179,8 @@ CABS1( ZDUM ) = ABS( DBLE( ZDUM ) ) + ABS( DIMAG( ZDUM ) ) * .. * .. Executable Statements .. +* + HUGEVAL = DLAMCH( 'Overflow' ) * * Test the input parameters. * @@ -183,7 +192,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( ANORM.LT.ZERO .OR. DISNAN( ANORM ) ) THEN + ELSE IF( ANORM.LT.ZERO ) THEN INFO = -5 END IF IF( INFO.NE.0 ) THEN @@ -199,6 +208,13 @@ RETURN ELSE IF( ANORM.EQ.ZERO ) THEN RETURN + ELSE IF( DISNAN( ANORM ) ) THEN + RCOND = ANORM + INFO = -5 + RETURN + ELSE IF( ANORM.GT.HUGEVAL ) THEN + INFO = -5 + RETURN END IF * SMLNUM = DLAMCH( 'Safe minimum' ) @@ -256,8 +272,17 @@ * * Compute the estimate of the reciprocal condition number. * - IF( AINVNM.NE.ZERO ) - $ RCOND = ( ONE / AINVNM ) / ANORM + IF( AINVNM.NE.ZERO ) THEN + RCOND = ( ONE / AINVNM ) / ANORM + ELSE + INFO = 1 + RETURN + END IF +* +* Check for NaNs and Infs +* + IF( DISNAN( RCOND ) .OR. RCOND.GT.HUGEVAL ) + $ INFO = 1 * 20 CONTINUE RETURN diff --git a/lapack-netlib/SRC/zgedmd.c b/lapack-netlib/SRC/zgedmd.c new file mode 100644 index 000000000..c1b39ba3e --- /dev/null +++ b/lapack-netlib/SRC/zgedmd.c @@ -0,0 +1,1676 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i= 0 */ +/* The state space dimension (the row dimension of X, Y). */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshot pairs */ +/* (the number of columns of X and Y). */ +/* ..... */ +/* X (input/output) COMPLEX(KIND=WP) M-by-N array */ +/* > On entry, X contains the data snapshot matrix X. It is */ +/* assumed that the column norms of X are in the range of */ +/* the normalized floating point numbers. */ +/* < On exit, the leading K columns of X contain a POD basis, */ +/* i.e. the leading K left singular vectors of the input */ +/* data matrix X, U(:,1:K). All N columns of X contain all */ +/* left singular vectors of the input matrix X. */ +/* See the descriptions of K, Z and W. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= M */ +/* The leading dimension of the array X. */ +/* ..... */ +/* Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array */ +/* > On entry, Y contains the data snapshot matrix Y */ +/* < On exit, */ +/* If JOBR == 'R', the leading K columns of Y contain */ +/* the residual vectors for the computed Ritz pairs. */ +/* See the description of RES. */ +/* If JOBR == 'N', Y contains the original input data, */ +/* scaled according to the value of JOBS. */ +/* ..... */ +/* LDY (input) INTEGER , LDY >= M */ +/* The leading dimension of the array Y. */ +/* ..... */ +/* NRNK (input) INTEGER */ +/* Determines the mode how to compute the numerical rank, */ +/* i.e. how to truncate small singular values of the input */ +/* matrix X. On input, if */ +/* NRNK = -1 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(1) */ +/* This option is recommended. */ +/* NRNK = -2 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(i-1) */ +/* This option is included for R&D purposes. */ +/* It requires highly accurate SVD, which */ +/* may not be feasible. */ +/* The numerical rank can be enforced by using positive */ +/* value of NRNK as follows: */ +/* 0 < NRNK <= N :: at most NRNK largest singular values */ +/* will be used. If the number of the computed nonzero */ +/* singular values is less than NRNK, then only those */ +/* nonzero values will be used and the actually used */ +/* dimension is less than NRNK. The actual number of */ +/* the nonzero singular values is returned in the variable */ +/* K. See the descriptions of TOL and K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the POD basis for the data snapshot */ +/* matrix X and the number of the computed Ritz pairs. */ +/* The value of K is determined according to the rule set */ +/* by the parameters NRNK and TOL. */ +/* See the descriptions of NRNK and TOL. */ +/* ..... */ +/* EIGS (output) COMPLEX(KIND=WP) N-by-1 array */ +/* The leading K (K<=N) entries of EIGS contain */ +/* the computed eigenvalues (Ritz values). */ +/* See the descriptions of K, and Z. */ +/* ..... */ +/* Z (workspace/output) COMPLEX(KIND=WP) M-by-N array */ +/* If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) */ +/* is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. */ +/* If JOBZ == 'F', then the Z(:,i)'s are given implicitly as */ +/* the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) */ +/* is an eigenvector corresponding to EIGS(i). The columns */ +/* of W(1:k,1:K) are the computed eigenvectors of the */ +/* K-by-K Rayleigh quotient. */ +/* See the descriptions of EIGS, X and W. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... */ +/* RES (output) REAL(KIND=WP) N-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs, */ +/* RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. */ +/* See the description of EIGS and Z. */ +/* ..... */ +/* B (output) COMPLEX(KIND=WP) M-by-N array. */ +/* IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:M,1:K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* If JOBF =='N', then B is not referenced. */ +/* See the descriptions of X, W, K. */ +/* ..... */ +/* LDB (input) INTEGER, LDB >= M */ +/* The leading dimension of the array B. */ +/* ..... */ +/* W (workspace/output) COMPLEX(KIND=WP) N-by-N array */ +/* On exit, W(1:K,1:K) contains the K computed */ +/* eigenvectors of the matrix Rayleigh quotient. */ +/* The Ritz vectors (returned in Z) are the */ +/* product of X (containing a POD basis for the input */ +/* matrix X) and W. See the descriptions of K, S, X and Z. */ +/* W is also used as a workspace to temporarily store the */ +/* right singular vectors of X. */ +/* ..... */ +/* LDW (input) INTEGER, LDW >= N */ +/* The leading dimension of the array W. */ +/* ..... */ +/* S (workspace/output) COMPLEX(KIND=WP) N-by-N array */ +/* The array S(1:K,1:K) is used for the matrix Rayleigh */ +/* quotient. This content is overwritten during */ +/* the eigenvalue decomposition by ZGEEV. */ +/* See the description of K. */ +/* ..... */ +/* LDS (input) INTEGER, LDS >= N */ +/* The leading dimension of the array S. */ +/* ..... */ +/* ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array */ +/* ZWORK is used as complex workspace in the complex SVD, as */ +/* specified by WHTSVD (1,2, 3 or 4) and for ZGEEV for computing */ +/* the eigenvalues of a Rayleigh quotient. */ +/* If the call to ZGEDMD is only workspace query, then */ +/* ZWORK(1) contains the minimal complex workspace length and */ +/* ZWORK(2) is the optimal complex workspace length. */ +/* Hence, the length of work is at least 2. */ +/* See the description of LZWORK. */ +/* ..... */ +/* LZWORK (input) INTEGER */ +/* The minimal length of the workspace vector ZWORK. */ +/* LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV), */ +/* where LZWORK_ZGEEV = MAX( 1, 2*N ) and the minimal */ +/* LZWORK_SVD is calculated as follows */ +/* If WHTSVD == 1 :: ZGESVD :: */ +/* LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) */ +/* If WHTSVD == 2 :: ZGESDD :: */ +/* LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) */ +/* If WHTSVD == 3 :: ZGESVDQ :: */ +/* LZWORK_SVD = obtainable by a query */ +/* If WHTSVD == 4 :: ZGEJSV :: */ +/* LZWORK_SVD = obtainable by a query */ +/* If on entry LZWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths and returns them in */ +/* LZWORK(1) and LZWORK(2), respectively. */ +/* ..... */ +/* RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array */ +/* On exit, RWORK(1:N) contains the singular values of */ +/* X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). */ +/* If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain */ +/* scaling factor RWORK(N+2)/RWORK(N+1) used to scale X */ +/* and Y to avoid overflow in the SVD of X. */ +/* This may be of interest if the scaling option is off */ +/* and as many as possible smallest eigenvalues are */ +/* desired to the highest feasible accuracy. */ +/* If the call to ZGEDMD is only workspace query, then */ +/* RWORK(1) contains the minimal workspace length. */ +/* See the description of LRWORK. */ +/* ..... */ +/* LRWORK (input) INTEGER */ +/* The minimal length of the workspace vector RWORK. */ +/* LRWORK is calculated as follows: */ +/* LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_ZGEEV), where */ +/* LRWORK_ZGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace */ +/* for the SVD subroutine determined by the input parameter */ +/* WHTSVD. */ +/* If WHTSVD == 1 :: ZGESVD :: */ +/* LRWORK_SVD = 5*MIN(M,N) */ +/* If WHTSVD == 2 :: ZGESDD :: */ +/* LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), */ +/* 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) */ +/* If WHTSVD == 3 :: ZGESVDQ :: */ +/* LRWORK_SVD = obtainable by a query */ +/* If WHTSVD == 4 :: ZGEJSV :: */ +/* LRWORK_SVD = obtainable by a query */ +/* If on entry LRWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* real workspace length and returns it in RWORK(1). */ +/* ..... */ +/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */ +/* Workspace that is required only if WHTSVD equals */ +/* 2 , 3 or 4. (See the description of WHTSVD). */ +/* If on entry LWORK =-1 or LIWORK=-1, then the */ +/* minimal length of IWORK is computed and returned in */ +/* IWORK(1). See the description of LIWORK. */ +/* ..... */ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */ +/* If on entry LIWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for ZWORK, RWORK and */ +/* IWORK. See the descriptions of ZWORK, RWORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). */ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ + +/* Local arrays */ +/* ~~~~~~~~~~~~ */ +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ +/* ............................................................ */ + /* Parameter adjustments */ + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --eigs; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + w_dim1 = *ldw; + w_offset = 1 + w_dim1 * 1; + w -= w_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --zwork; + --rwork; + --iwork; + + /* Function Body */ + zero = 0.f; + one = 1.f; + zzero.r = 0.f, zzero.i = 0.f; + zone.r = 1.f, zone.i = 0.f; + +/* Test the input arguments */ + + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + *info = 0; + lquery = *lzwork == -1 || *liwork == -1 || *lrwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || lsame_(jobz, "N") || lsame_( + jobz, "F"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && ! wntvec) { + *info = -3; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -4; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -5; + } else if (*m < 0) { + *info = -6; + } else if (*n < 0 || *n > *m) { + *info = -7; + } else if (*ldx < *m) { + *info = -9; + } else if (*ldy < *m) { + *info = -11; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -12; + } else if (*tol < zero || *tol >= one) { + *info = -13; + } else if (*ldz < *m) { + *info = -17; + } else if ((wntref || wntex) && *ldb < *m) { + *info = -20; + } else if (*ldw < *n) { + *info = -22; + } else if (*lds < *n) { + *info = -24; + } + + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0) { +/* Quick return. All output except K is void. */ +/* INFO=1 signals the void input. */ +/* In case of a workspace query, the default */ +/* minimal workspace lengths are returned. */ + if (lquery) { + iwork[1] = 1; + rwork[1] = 1.; + zwork[1].r = 2., zwork[1].i = 0.; + zwork[2].r = 2., zwork[2].i = 0.; + } else { + *k = 0; + } + *info = 1; + return 0; + } + iminwr = 1; + mlrwrk = f2cmax(1,*n); + mlwork = 2; + olwork = 2; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of ZGESVD: */ +/* MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) */ +/* Computing MAX */ + i__1 = 1, i__2 = (f2cmin(*m,*n) << 1) + f2cmax(*m,*n); + mwrsvd = f2cmax(i__1,i__2); + mlwork = f2cmax(mlwork,mwrsvd); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = *n + f2cmin(*m,*n) * 5; + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + zgesvd_("O", "S", m, n, &x[x_offset], ldx, &rwork[1], &b[ + b_offset], ldb, &w[w_offset], ldw, &zwork[1], &c_n1, + rdummy, &info1); + lwrsvd = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrsvd); + } + } else if (*whtsvd == 2) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of ZGESDD: */ +/* MWRSDD = 2*f2cmin(M,N)*f2cmin(M,N)+2*f2cmin(M,N)+f2cmax(M,N). */ +/* RWORK length: 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N) */ +/* In LAPACK 3.10.1 RWORK is defined differently. */ +/* Below we take f2cmax over the two versions. */ +/* IMINWR = 8*MIN(M,N) */ + mwrsdd = (f2cmin(*m,*n) << 1) * f2cmin(*m,*n) + (f2cmin(*m,*n) << 1) + f2cmax( + *m,*n); + mlwork = f2cmax(mlwork,mwrsdd); + iminwr = f2cmin(*m,*n) << 3; +/* Computing MAX */ +/* Computing MAX */ + i__3 = f2cmin(*m,*n) * 5 * f2cmin(*m,*n) + f2cmin(*m,*n) * 7, i__4 = f2cmin(* + m,*n) * 5 * f2cmin(*m,*n) + f2cmin(*m,*n) * 5, i__3 = f2cmax(i__3, + i__4), i__4 = (f2cmax(*m,*n) << 1) * f2cmin(*m,*n) + (f2cmin(*m,*n) + << 1) * f2cmin(*m,*n) + f2cmin(*m,*n); + i__1 = mlrwrk, i__2 = *n + f2cmax(i__3,i__4); + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + zgesdd_("O", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], + ldb, &w[w_offset], ldw, &zwork[1], &c_n1, rdummy, & + iwork[1], &info1); +/* Computing MAX */ + i__1 = mwrsdd, i__2 = (integer) zwork[1].r; + lwrsdd = f2cmax(i__1,i__2); +/* Possible bug in ZGESDD optimal workspace size. */ + olwork = f2cmax(olwork,lwrsdd); + } + } else if (*whtsvd == 3) { + zgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &rwork[ + 1], &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, & + iwork[1], &c_n1, &zwork[1], &c_n1, rdummy, &c_n1, &info1); + iminwr = iwork[1]; + mwrsvq = (integer) zwork[2].r; + mlwork = f2cmax(mlwork,mwrsvq); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = *n + (integer) rdummy[0]; + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + lwrsvq = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrsvq); + } + } else if (*whtsvd == 4) { + *(unsigned char *)jsvopt = 'J'; + zgejsv_("F", "U", jsvopt, "R", "N", "P", m, n, &x[x_offset], ldx, + &rwork[1], &z__[z_offset], ldz, &w[w_offset], ldw, &zwork[ + 1], &c_n1, rdummy, &c_n1, &iwork[1], &info1); + iminwr = iwork[1]; + mwrsvj = (integer) zwork[2].r; + mlwork = f2cmax(mlwork,mwrsvj); +/* Computing MAX */ +/* Computing MAX */ + i__3 = 7, i__4 = (integer) rdummy[0]; + i__1 = mlrwrk, i__2 = *n + f2cmax(i__3,i__4); + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + lwrsvj = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrsvj); + } +/* END SELECT */ + } + if (wntvec || wntex || lsame_(jobz, "F")) { + *(unsigned char *)jobzl = 'V'; + } else { + *(unsigned char *)jobzl = 'N'; + } +/* Workspace calculation to the ZGEEV call */ +/* Computing MAX */ + i__1 = 1, i__2 = *n << 1; + mwrkev = f2cmax(i__1,i__2); + mlwork = f2cmax(mlwork,mwrkev); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = *n + (*n << 1); + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + zgeev_("N", jobzl, n, &s[s_offset], lds, &eigs[1], &w[w_offset], + ldw, &w[w_offset], ldw, &zwork[1], &c_n1, &rwork[1], & + info1); + lwrkev = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrkev); + } + + if (*liwork < iminwr && ! lquery) { + *info = -30; + } + if (*lrwork < mlrwrk && ! lquery) { + *info = -28; + } + if (*lzwork < mlwork && ! lquery) { + *info = -26; + } + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZGEDMD", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + rwork[1] = (doublereal) mlrwrk; + zwork[1].r = (doublereal) mlwork, zwork[1].i = 0.; + zwork[2].r = (doublereal) olwork, zwork[2].i = 0.; + return 0; + } +/* ............................................................ */ + + ofl = dlamch_("O"); + small = dlamch_("S"); + badxy = FALSE_; + +/* <1> Optional scaling of the snapshots (columns of X, Y) */ +/* ========================================================== */ + if (sccolx) { +/* The columns of X will be normalized. */ +/* To prevent overflows, the column norms of X are */ +/* carefully computed using ZLASSQ. */ + *k = 0; + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* WORK(i) = DZNRM2( M, X(1,i), 1 ) */ + scale = zero; + zlassq_(m, &x[i__ * x_dim1 + 1], &c__1, &scale, &ssum); + if (disnan_(&scale) || disnan_(&ssum)) { + *k = 0; + *info = -8; + i__2 = -(*info); + xerbla_("ZGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of X(:,i) overflows. First, X(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. */ +/* Next, the norm of X(:,i) is stored without */ +/* overflow as RWORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of X(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + d__1 = one / rootsc; + zlascl_("G", &c__0, &c__0, &scale, &d__1, m, &c__1, &x[ + i__ * x_dim1 + 1], ldx, &info2); + rwork[i__] = -scale * (rootsc / (doublereal) (*m)); + } else { +/* X(:,i) will be scaled to unit 2-norm */ + rwork[i__] = scale * rootsc; + zlascl_("G", &c__0, &c__0, &rwork[i__], &one, m, &c__1, & + x[i__ * x_dim1 + 1], ldx, &info2); +/* X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC */ +/* LAPACK CALL */ + } + } else { + rwork[i__] = zero; + ++(*k); + } + } + if (*k == *n) { +/* All columns of X are zero. Return error code -8. */ +/* (the 8th input variable had an illegal value) */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("ZGEDMD", &i__1); + return 0; + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of Y. */ + if (rwork[i__] > zero) { + d__1 = one / rwork[i__]; + zdscal_(m, &d__1, &y[i__ * y_dim1 + 1], &c__1); +/* Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (rwork[i__] < zero) { + d__1 = -rwork[i__]; + d__2 = one / (doublereal) (*m); + zlascl_("G", &c__0, &c__0, &d__1, &d__2, m, &c__1, &y[i__ * + y_dim1 + 1], ldy, &info2); +/* LAPACK C */ + } else if (z_abs(&y[izamax_(m, &y[i__ * y_dim1 + 1], &c__1) + i__ + * y_dim1]) != zero) { +/* X(:,i) is zero vector. For consistency, */ +/* Y(:,i) should also be zero. If Y(:,i) is not */ +/* zero, then the data might be inconsistent or */ +/* corrupted. If JOBS == 'C', Y(:,i) is set to */ +/* zero and a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + if (lsame_(jobs, "C")) { + zdscal_(m, &zero, &y[i__ * y_dim1 + 1], &c__1); + } +/* BLAS CALL */ + } + } + } + + if (sccoly) { +/* The columns of Y will be normalized. */ +/* To prevent overflows, the column norms of Y are */ +/* carefully computed using ZLASSQ. */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* RWORK(i) = DZNRM2( M, Y(1,i), 1 ) */ + scale = zero; + zlassq_(m, &y[i__ * y_dim1 + 1], &c__1, &scale, &ssum); + if (disnan_(&scale) || disnan_(&ssum)) { + *k = 0; + *info = -10; + i__2 = -(*info); + xerbla_("ZGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of Y(:,i) overflows. First, Y(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. */ +/* Next, the norm of Y(:,i) is stored without */ +/* overflow as RWORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of Y(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + d__1 = one / rootsc; + zlascl_("G", &c__0, &c__0, &scale, &d__1, m, &c__1, &y[ + i__ * y_dim1 + 1], ldy, &info2); + rwork[i__] = -scale * (rootsc / (doublereal) (*m)); + } else { +/* Y(:,i) will be scaled to unit 2-norm */ + rwork[i__] = scale * rootsc; + zlascl_("G", &c__0, &c__0, &rwork[i__], &one, m, &c__1, & + y[i__ * y_dim1 + 1], ldy, &info2); +/* Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* LAPAC */ + } + } else { + rwork[i__] = zero; + } + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of X. */ + if (rwork[i__] > zero) { + d__1 = one / rwork[i__]; + zdscal_(m, &d__1, &x[i__ * x_dim1 + 1], &c__1); +/* X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (rwork[i__] < zero) { + d__1 = -rwork[i__]; + d__2 = one / (doublereal) (*m); + zlascl_("G", &c__0, &c__0, &d__1, &d__2, m, &c__1, &x[i__ * + x_dim1 + 1], ldx, &info2); +/* LAPACK C */ + } else if (z_abs(&x[izamax_(m, &x[i__ * x_dim1 + 1], &c__1) + i__ + * x_dim1]) != zero) { +/* Y(:,i) is zero vector. If X(:,i) is not */ +/* zero, then a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + } + } + } + +/* <2> SVD of the data snapshot matrix X. */ +/* ===================================== */ +/* The left singular vectors are stored in the array X. */ +/* The right singular vectors are in the array W. */ +/* The array W will later on contain the eigenvectors */ +/* of a Rayleigh quotient. */ + numrnk = *n; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { + zgesvd_("O", "S", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], + ldb, &w[w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], & + info1); +/* LA */ + *(unsigned char *)t_or_n__ = 'C'; + } else if (*whtsvd == 2) { + zgesdd_("O", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], ldb, & + w[w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &iwork[1] + , &info1); +/* LAP */ + *(unsigned char *)t_or_n__ = 'C'; + } else if (*whtsvd == 3) { + i__1 = *lrwork - *n; + zgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &rwork[1], + &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &iwork[1], + liwork, &zwork[1], lzwork, &rwork[*n + 1], &i__1, &info1); +/* LAPACK CA */ + zlacpy_("A", m, &numrnk, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK C */ + *(unsigned char *)t_or_n__ = 'C'; + } else if (*whtsvd == 4) { + i__1 = *lrwork - *n; + zgejsv_("F", "U", jsvopt, "R", "N", "P", m, n, &x[x_offset], ldx, & + rwork[1], &z__[z_offset], ldz, &w[w_offset], ldw, &zwork[1], + lzwork, &rwork[*n + 1], &i__1, &iwork[1], &info1); + zlacpy_("A", m, n, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK CALL */ + *(unsigned char *)t_or_n__ = 'N'; + xscl1 = rwork[*n + 1]; + xscl2 = rwork[*n + 2]; + if (xscl1 != xscl2) { +/* This is an exceptional situation. If the */ +/* data matrices are not scaled and the */ +/* largest singular value of X overflows. */ +/* In that case ZGEJSV can return the SVD */ +/* in scaled form. The scaling factor can be used */ +/* to rescale the data (X and Y). */ + zlascl_("G", &c__0, &c__0, &xscl1, &xscl2, m, n, &y[y_offset], + ldy, &info2); + } +/* END SELECT */ + } + + if (info1 > 0) { +/* The SVD selected subroutine did not converge. */ +/* Return with an error code. */ + *info = 2; + return 0; + } + + if (rwork[1] == zero) { +/* The largest computed singular value of (scaled) */ +/* X is zero. Return error code -8 */ +/* (the 8th input variable had an illegal value). */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("ZGEDMD", &i__1); + return 0; + } + +/* <3> Determine the numerical rank of the data */ +/* snapshots matrix X. This depends on the */ +/* parameters NRNK and TOL. */ +/* SELECT CASE ( NRNK ) */ + if (*nrnk == -1) { + *k = 1; + i__1 = numrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (rwork[i__] <= rwork[1] * *tol || rwork[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else if (*nrnk == -2) { + *k = 1; + i__1 = numrnk - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + if (rwork[i__ + 1] <= rwork[i__] * *tol || rwork[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else { + *k = 1; + i__1 = *nrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (rwork[i__] <= small) { + myexit_(); + } + ++(*k); + } +/* END SELECT */ + } +/* Now, U = X(1:M,1:K) is the SVD/POD basis for the */ +/* snapshot data in the input matrix X. */ +/* <4> Compute the Rayleigh quotient S = U^H * A * U. */ +/* Depending on the requested outputs, the computation */ +/* is organized to compute additional auxiliary */ +/* matrices (for the residuals and refinements). */ + +/* In all formulas below, we need V_k*Sigma_k^(-1) */ +/* where either V_k is in W(1:N,1:K), or V_k^H is in */ +/* W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). */ + if (lsame_(t_or_n__, "N")) { + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + d__1 = one / rwork[i__]; + zdscal_(n, &d__1, &w[i__ * w_dim1 + 1], &c__1); +/* W(1:N,i) = (ONE/RWORK(i)) * W(1:N,i) ! INTRINSIC */ +/* BLAS CALL */ + } + } else { +/* This non-unit stride access is due to the fact */ +/* that ZGESVD, ZGESVDQ and ZGESDD return the */ +/* adjoint matrix of the right singular vectors. */ +/* DO i = 1, K */ +/* CALL ZDSCAL( N, ONE/RWORK(i), W(i,1), LDW ) ! BLAS CALL */ +/* ! W(i,1:N) = (ONE/RWORK(i)) * W(i,1:N) ! INTRINSIC */ +/* END DO */ + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + rwork[*n + i__] = one / rwork[i__]; + } + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *k; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * w_dim1; + i__4 = *n + i__; + z__2.r = rwork[i__4], z__2.i = zero; + i__5 = i__ + j * w_dim1; + z__1.r = z__2.r * w[i__5].r - z__2.i * w[i__5].i, z__1.i = + z__2.r * w[i__5].i + z__2.i * w[i__5].r; + w[i__3].r = z__1.r, w[i__3].i = z__1.i; + } + } + } + + if (wntref) { + +/* Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) */ +/* for computing the refined Ritz vectors */ +/* (optionally, outside ZGEDMD). */ + zgemm_("N", t_or_n__, m, k, n, &zone, &y[y_offset], ldy, &w[w_offset], + ldw, &zzero, &z__[z_offset], ldz); +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(CONJG(W(1:K,1:N)))) ! */ +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K)) ! */ + +/* At this point Z contains */ +/* A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and */ +/* this is needed for computing the residuals. */ +/* This matrix is returned in the array B and */ +/* it can be used to compute refined Ritz vectors. */ +/* BLA */ + zlacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); +/* B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC */ +/* BLAS CALL */ + zgemm_("C", "N", k, k, m, &zone, &x[x_offset], ldx, &z__[z_offset], + ldz, &zzero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(TRANSPOSE(CONJG(X(1:M,1:K))),Z(1:M,1:K)) */ +/* At this point S = U^H * A * U is the Rayleigh quotient. */ +/* BLA */ + } else { +/* A * U(:,1:K) is not explicitly needed and the */ +/* computation is organized differently. The Rayleigh */ +/* quotient is computed more efficiently. */ + zgemm_("C", "N", k, n, m, &zone, &x[x_offset], ldx, &y[y_offset], ldy, + &zzero, &z__[z_offset], ldz); +/* Z(1:K,1:N) = MATMUL( TRANSPOSE(CONJG(X(1:M,1:K))), Y(1:M,1:N) */ + + zgemm_("N", t_or_n__, k, k, n, &zone, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zzero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(CONJG(W(1:K,1:N)))) ! */ +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K))) ! */ +/* At this point S = U^H * A * U is the Rayleigh quotient. */ +/* If the residuals are requested, save scaled V_k into Z. */ +/* Recall that V_k or V_k^H is stored in W. */ +/* BLAS */ + if (wntres || wntex) { + if (lsame_(t_or_n__, "N")) { + zlacpy_("A", n, k, &w[w_offset], ldw, &z__[z_offset], ldz); + } else { + zlacpy_("A", k, n, &w[w_offset], ldw, &z__[z_offset], ldz); + } + } + } + +/* <5> Compute the Ritz values and (if requested) the */ +/* right eigenvectors of the Rayleigh quotient. */ + + zgeev_("N", jobzl, k, &s[s_offset], lds, &eigs[1], &w[w_offset], ldw, &w[ + w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &info1); + +/* W(1:K,1:K) contains the eigenvectors of the Rayleigh */ +/* quotient. See the description of Z. */ +/* Also, see the description of ZGEEV. */ +/* LAPACK CALL */ + if (info1 > 0) { +/* ZGEEV failed to compute the eigenvalues and */ +/* eigenvectors of the Rayleigh quotient. */ + *info = 3; + return 0; + } + +/* <6> Compute the eigenvectors (if requested) and, */ +/* the residuals (if requested). */ + + if (wntvec || wntex) { + if (wntres) { + if (wntref) { +/* Here, if the refinement is requested, we have */ +/* A*U(:,1:K) already computed and stored in Z. */ +/* For the residuals, need Y = A * U(:,1;K) * W. */ + zgemm_("N", "N", m, k, k, &zone, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zzero, &y[y_offset], ldy); +/* Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC */ +/* This frees Z; Y contains A * U(:,1:K) * W. */ +/* BLAS CALL */ + } else { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) (or its adjoint) is stored in Z */ + zgemm_(t_or_n__, "N", n, k, k, &zone, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zzero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + zgemm_("N", "N", m, k, n, &zone, &y[y_offset], ldy, &s[ + s_offset], lds, &zzero, &z__[z_offset], ldz); +/* Save a copy of Z into Y and free Z for holding */ +/* the Ritz vectors. */ + zlacpy_("A", m, k, &z__[z_offset], ldz, &y[y_offset], ldy); + if (wntex) { + zlacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); + } + } + } else if (wntex) { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) is stored in Z */ + zgemm_(t_or_n__, "N", n, k, k, &zone, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zzero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + zgemm_("N", "N", m, k, n, &zone, &y[y_offset], ldy, &s[s_offset], + lds, &zzero, &b[b_offset], ldb); +/* The above call replaces the following two calls */ +/* that were used in the developing-testing phase. */ +/* CALL ZGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & */ +/* LDS, ZZERO, Z, LDZ) */ +/* Save a copy of Z into B and free Z for holding */ +/* the Ritz vectors. */ +/* CALL ZLACPY( 'A', M, K, Z, LDZ, B, LDB ) */ + } + +/* Compute the Ritz vectors */ + if (wntvec) { + zgemm_("N", "N", m, k, k, &zone, &x[x_offset], ldx, &w[w_offset], + ldw, &zzero, &z__[z_offset], ldz); + } +/* Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC */ + +/* BLAS CALL */ + if (wntres) { + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + z__1.r = -eigs[i__2].r, z__1.i = -eigs[i__2].i; + zaxpy_(m, &z__1, &z__[i__ * z_dim1 + 1], &c__1, &y[i__ * + y_dim1 + 1], &c__1); +/* Y(1:M,i) = Y(1:M,i) - EIGS(i) * Z(1:M,i) ! INTR */ +/* BLAS */ + res[i__] = dznrm2_(m, &y[i__ * y_dim1 + 1], &c__1); +/* BLAS */ + } + } + } + + if (*whtsvd == 4) { + rwork[*n + 1] = xscl1; + rwork[*n + 2] = xscl2; + } + +/* Successful exit. */ + if (! badxy) { + *info = 0; + } else { +/* A warning on possible data inconsistency. */ +/* This should be a rare event. */ + *info = 4; + } +/* ............................................................ */ + return 0; +/* ...... */ +} /* zgedmd_ */ + diff --git a/lapack-netlib/SRC/zgedmd.f90 b/lapack-netlib/SRC/zgedmd.f90 new file mode 100644 index 000000000..5045cb166 --- /dev/null +++ b/lapack-netlib/SRC/zgedmd.f90 @@ -0,0 +1,1148 @@ +!> \brief \b ZGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== +! +! SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, EIGS, Z, LDZ, RES, B, LDB, & +! W, LDW, S, LDS, ZWORK, LZWORK, & +! RWORK, LRWORK, IWORK, LIWORK, INFO ) +!...... +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real64 +! +!...... +! Scalar arguments +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LIWORK, LRWORK, LZWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments +! COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) +! COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) +! REAL(KIND=WP), INTENT(OUT) :: RES(*) +! REAL(KIND=WP), INTENT(OUT) :: RWORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) +! +!............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> ZGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, ZGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, ZGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim +!...................................................................... +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. +!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim +!...................................................................... +!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim +!............................................................ +! Arguments +! ========= +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim +!..... +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. +!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim +!..... +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. +!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim +!..... +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim +!..... +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library. +!> 1 :: ZGESVD (the QR SVD algorithm) +!> 2 :: ZGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: ZGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim +!..... +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim +!..... +!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim +!..... +!> \param[in] LDX +!> \verbatim +!> X (input/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. +!..... +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim +!..... +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim +!..... +!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim +!..... +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1) +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim +!..... +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim +!..... +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim +!..... +!> \param[out] EIGS +!> \verbatim +!> EIGS (output) COMPLEX(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of EIGS contain +!> the computed eigenvalues (Ritz values). +!> See the descriptions of K, and Z. +!> \endverbatim +!..... +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) COMPLEX(KIND=WP) M-by-N array +!> If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) +!> is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. +!> If JOBZ == 'F', then the Z(:,i)'s are given implicitly as +!> the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) +!> is an eigenvector corresponding to EIGS(i). The columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. +!> See the descriptions of EIGS, X and W. +!> \endverbatim +!..... +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim +!..... +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs, +!> RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. +!> See the description of EIGS and Z. +!> \endverbatim +!..... +!> \param[out] B +!> \verbatim +!> B (output) COMPLEX(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1:K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim +!..... +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim +!..... +!> \param[out] W +!> \verbatim +!> W (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient. +!> The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> right singular vectors of X. +!> \endverbatim +!..... +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim +!..... +!> \param[out] S +!> \verbatim +!> S (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by ZGEEV. +!> See the description of K. +!> \endverbatim +!..... +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim +!..... +!> \param[out] ZWORK +!> \verbatim +!> ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array +!> ZWORK is used as complex workspace in the complex SVD, as +!> specified by WHTSVD (1,2, 3 or 4) and for ZGEEV for computing +!> the eigenvalues of a Rayleigh quotient. +!> If the call to ZGEDMD is only workspace query, then +!> ZWORK(1) contains the minimal complex workspace length and +!> ZWORK(2) is the optimal complex workspace length. +!> Hence, the length of work is at least 2. +!> See the description of LZWORK. +!> \endverbatim +!..... +!> \param[in] LZWORK +!> \verbatim +!> LZWORK (input) INTEGER +!> The minimal length of the workspace vector ZWORK. +!> LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV), +!> where LZWORK_ZGEEV = MAX( 1, 2*N ) and the minimal +!> LZWORK_SVD is calculated as follows +!> If WHTSVD == 1 :: ZGESVD :: +!> LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) +!> If WHTSVD == 2 :: ZGESDD :: +!> LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) +!> If WHTSVD == 3 :: ZGESVDQ :: +!> LZWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: ZGEJSV :: +!> LZWORK_SVD = obtainable by a query +!> If on entry LZWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths and returns them in +!> LZWORK(1) and LZWORK(2), respectively. +!> \endverbatim +!..... +!> \param[out] RWORK +!> \verbatim +!> RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array +!> On exit, RWORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +!> If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain +!> scaling factor RWORK(N+2)/RWORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. +!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. +!> If the call to ZGEDMD is only workspace query, then +!> RWORK(1) contains the minimal workspace length. +!> See the description of LRWORK. +!> \endverbatim +!..... +!> \param[in] LRWORK +!> \verbatim +!> LRWORK (input) INTEGER +!> The minimal length of the workspace vector RWORK. +!> LRWORK is calculated as follows: +!> LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_ZGEEV), where +!> LRWORK_ZGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace +!> for the SVD subroutine determined by the input parameter +!> WHTSVD. +!> If WHTSVD == 1 :: ZGESVD :: +!> LRWORK_SVD = 5*MIN(M,N) +!> If WHTSVD == 2 :: ZGESDD :: +!> LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), +!> 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) +!> If WHTSVD == 3 :: ZGESVDQ :: +!> LRWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: ZGEJSV :: +!> LRWORK_SVD = obtainable by a query +!> If on entry LRWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> real workspace length and returns it in RWORK(1). +!> \endverbatim +!..... +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim +!..... +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for ZWORK, RWORK and +!> IWORK. See the descriptions of ZWORK, RWORK and IWORK. +!> \endverbatim +!..... +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! +!............................................................. +!............................................................. + SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, EIGS, Z, LDZ, RES, B, LDB, & + W, LDW, S, LDS, ZWORK, LZWORK, & + RWORK, LRWORK, IWORK, LIWORK, INFO ) +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 +! +! Scalar arguments +! ~~~~~~~~~~~~~~~~ + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LIWORK, LRWORK, LZWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! +! Array arguments +! ~~~~~~~~~~~~~~~ + COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) + COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) + REAL(KIND=WP), INTENT(OUT) :: RES(*) + REAL(KIND=WP), INTENT(OUT) :: RWORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +! +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP + COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) + COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) +! +! Local scalars +! ~~~~~~~~~~~~~ + REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & + SSUM, XSCL1, XSCL2 + INTEGER :: i, j, IMINWR, INFO1, INFO2, & + LWRKEV, LWRSDD, LWRSVD, LWRSVJ, & + LWRSVQ, MLWORK, MWRKEV, MWRSDD, & + MWRSVD, MWRSVJ, MWRSVQ, NUMRNK, & + OLWORK, MLRWRK + LOGICAL :: BADXY, LQUERY, SCCOLX, SCCOLY, & + WNTEX, WNTREF, WNTRES, WNTVEC + CHARACTER :: JOBZL, T_OR_N + CHARACTER :: JSVOPT +! +! Local arrays +! ~~~~~~~~~~~~ + REAL(KIND=WP) :: RDUMMY(2) +! +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + REAL(KIND=WP) ZLANGE, DLAMCH, DZNRM2 + EXTERNAL ZLANGE, DLAMCH, DZNRM2, IZAMAX + INTEGER IZAMAX + LOGICAL DISNAN, LSAME + EXTERNAL DISNAN, LSAME +! +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL ZAXPY, ZGEMM, ZDSCAL + EXTERNAL ZGEEV, ZGEJSV, ZGESDD, ZGESVD, ZGESVDQ, & + ZLACPY, ZLASCL, ZLASSQ, XERBLA +! +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC DBLE, INT, MAX, SQRT +!............................................................ +! +! Test the input arguments +! + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME(JOBS,'C') + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + INFO = 0 + LQUERY = ( ( LZWORK == -1 ) .OR. ( LIWORK == -1 ) & + .OR. ( LRWORK == -1 ) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. & + LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. LSAME(JOBZ,'N') & + .OR. LSAME(JOBZ,'F')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. (.NOT.WNTVEC) ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -4 + ELSE IF ( .NOT.((WHTSVD == 1) .OR. (WHTSVD == 2) .OR. & + (WHTSVD == 3) .OR. (WHTSVD == 4) )) THEN + INFO = -5 + ELSE IF ( M < 0 ) THEN + INFO = -6 + ELSE IF ( ( N < 0 ) .OR. ( N > M ) ) THEN + INFO = -7 + ELSE IF ( LDX < M ) THEN + INFO = -9 + ELSE IF ( LDY < M ) THEN + INFO = -11 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -12 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -13 + ELSE IF ( LDZ < M ) THEN + INFO = -17 + ELSE IF ( (WNTREF .OR. WNTEX ) .AND. ( LDB < M ) ) THEN + INFO = -20 + ELSE IF ( LDW < N ) THEN + INFO = -22 + ELSE IF ( LDS < N ) THEN + INFO = -24 + END IF +! + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( N == 0 ) THEN + ! Quick return. All output except K is void. + ! INFO=1 signals the void input. + ! In case of a workspace query, the default + ! minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + RWORK(1) = 1 + ZWORK(1) = 2 + ZWORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + + IMINWR = 1 + MLRWRK = MAX(1,N) + MLWORK = 2 + OLWORK = 2 + SELECT CASE ( WHTSVD ) + CASE (1) + ! The following is specified as the minimal + ! length of WORK in the definition of ZGESVD: + ! MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) + MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) + MLWORK = MAX(MLWORK,MWRSVD) + MLRWRK = MAX(MLRWRK,N + 5*MIN(M,N)) + IF ( LQUERY ) THEN + CALL ZGESVD( 'O', 'S', M, N, X, LDX, RWORK, & + B, LDB, W, LDW, ZWORK, -1, RDUMMY, INFO1 ) + LWRSVD = INT( ZWORK(1) ) + OLWORK = MAX(OLWORK,LWRSVD) + END IF + CASE (2) + ! The following is specified as the minimal + ! length of WORK in the definition of ZGESDD: + ! MWRSDD = 2*min(M,N)*min(M,N)+2*min(M,N)+max(M,N). + ! RWORK length: 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N) + ! In LAPACK 3.10.1 RWORK is defined differently. + ! Below we take max over the two versions. + ! IMINWR = 8*MIN(M,N) + MWRSDD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) + MLWORK = MAX(MLWORK,MWRSDD) + IMINWR = 8*MIN(M,N) + MLRWRK = MAX( MLRWRK, N + & + MAX( 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), & + 5*MIN(M,N)*MIN(M,N)+5*MIN(M,N), & + 2*MAX(M,N)*MIN(M,N)+ & + 2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) + IF ( LQUERY ) THEN + CALL ZGESDD( 'O', M, N, X, LDX, RWORK, B,LDB,& + W, LDW, ZWORK, -1, RDUMMY, IWORK, INFO1 ) + LWRSDD = MAX( MWRSDD,INT( ZWORK(1) )) + ! Possible bug in ZGESDD optimal workspace size. + OLWORK = MAX(OLWORK,LWRSDD) + END IF + CASE (3) + CALL ZGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, RWORK, Z, LDZ, W, LDW, NUMRNK, & + IWORK, -1, ZWORK, -1, RDUMMY, -1, INFO1 ) + IMINWR = IWORK(1) + MWRSVQ = INT(ZWORK(2)) + MLWORK = MAX(MLWORK,MWRSVQ) + MLRWRK = MAX(MLRWRK,N + INT(RDUMMY(1))) + IF ( LQUERY ) THEN + LWRSVQ = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,LWRSVQ) + END IF + CASE (4) + JSVOPT = 'J' + CALL ZGEJSV( 'F', 'U', JSVOPT, 'R', 'N', 'P', M, & + N, X, LDX, RWORK, Z, LDZ, W, LDW, & + ZWORK, -1, RDUMMY, -1, IWORK, INFO1 ) + IMINWR = IWORK(1) + MWRSVJ = INT(ZWORK(2)) + MLWORK = MAX(MLWORK,MWRSVJ) + MLRWRK = MAX(MLRWRK,N + MAX(7,INT(RDUMMY(1)))) + IF ( LQUERY ) THEN + LWRSVJ = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,LWRSVJ) + END IF + END SELECT + IF ( WNTVEC .OR. WNTEX .OR. LSAME(JOBZ,'F') ) THEN + JOBZL = 'V' + ELSE + JOBZL = 'N' + END IF + ! Workspace calculation to the ZGEEV call + MWRKEV = MAX( 1, 2*N ) + MLWORK = MAX(MLWORK,MWRKEV) + MLRWRK = MAX(MLRWRK,N+2*N) + IF ( LQUERY ) THEN + CALL ZGEEV( 'N', JOBZL, N, S, LDS, EIGS, & + W, LDW, W, LDW, ZWORK, -1, RWORK, INFO1 ) + LWRKEV = INT(ZWORK(1)) + OLWORK = MAX( OLWORK, LWRKEV ) + END IF +! + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -30 + IF ( LRWORK < MLRWRK .AND. (.NOT.LQUERY) ) INFO = -28 + IF ( LZWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -26 + + END IF +! + IF( INFO /= 0 ) THEN + CALL XERBLA( 'ZGEDMD', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + RWORK(1) = MLRWRK + ZWORK(1) = MLWORK + ZWORK(2) = OLWORK + RETURN + END IF +!............................................................ +! + OFL = DLAMCH('O') + SMALL = DLAMCH('S') + BADXY = .FALSE. +! +! <1> Optional scaling of the snapshots (columns of X, Y) +! ========================================================== + IF ( SCCOLX ) THEN + ! The columns of X will be normalized. + ! To prevent overflows, the column norms of X are + ! carefully computed using ZLASSQ. + K = 0 + DO i = 1, N + !WORK(i) = DZNRM2( M, X(1,i), 1 ) + SSUM = ONE + SCALE = ZERO + CALL ZLASSQ( M, X(1,i), 1, SCALE, SSUM ) + IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN + K = 0 + INFO = -8 + CALL XERBLA('ZGEDMD',-INFO) + END IF + IF ( (SCALE /= ZERO) .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of X(:,i) overflows. First, X(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. +! Next, the norm of X(:,i) is stored without +! overflow as RWORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of X(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL ZLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, X(1,i), LDX, INFO2 ) + RWORK(i) = - SCALE * ( ROOTSC / DBLE(M) ) + ELSE +! X(:,i) will be scaled to unit 2-norm + RWORK(i) = SCALE * ROOTSC + CALL ZLASCL( 'G',0, 0, RWORK(i), ONE, M, 1, & + X(1,i), LDX, INFO2 ) ! LAPACK CALL +! X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC + END IF + ELSE + RWORK(i) = ZERO + K = K + 1 + END IF + END DO + IF ( K == N ) THEN + ! All columns of X are zero. Return error code -8. + ! (the 8th input variable had an illegal value) + K = 0 + INFO = -8 + CALL XERBLA('ZGEDMD',-INFO) + RETURN + END IF + DO i = 1, N +! Now, apply the same scaling to the columns of Y. + IF ( RWORK(i) > ZERO ) THEN + CALL ZDSCAL( M, ONE/RWORK(i), Y(1,i), 1 ) ! BLAS CALL +! Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC + ELSE IF ( RWORK(i) < ZERO ) THEN + CALL ZLASCL( 'G', 0, 0, -RWORK(i), & + ONE/DBLE(M), M, 1, Y(1,i), LDY, INFO2 ) ! LAPACK CALL + ELSE IF ( ABS(Y(IZAMAX(M, Y(1,i),1),i )) & + /= ZERO ) THEN +! X(:,i) is zero vector. For consistency, +! Y(:,i) should also be zero. If Y(:,i) is not +! zero, then the data might be inconsistent or +! corrupted. If JOBS == 'C', Y(:,i) is set to +! zero and a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + IF ( LSAME(JOBS,'C')) & + CALL ZDSCAL( M, ZERO, Y(1,i), 1 ) ! BLAS CALL + END IF + END DO + END IF + ! + IF ( SCCOLY ) THEN + ! The columns of Y will be normalized. + ! To prevent overflows, the column norms of Y are + ! carefully computed using ZLASSQ. + DO i = 1, N + !RWORK(i) = DZNRM2( M, Y(1,i), 1 ) + SSUM = ONE + SCALE = ZERO + CALL ZLASSQ( M, Y(1,i), 1, SCALE, SSUM ) + IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN + K = 0 + INFO = -10 + CALL XERBLA('ZGEDMD',-INFO) + END IF + IF ( SCALE /= ZERO .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of Y(:,i) overflows. First, Y(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. +! Next, the norm of Y(:,i) is stored without +! overflow as RWORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of Y(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL ZLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, Y(1,i), LDY, INFO2 ) + RWORK(i) = - SCALE * ( ROOTSC / DBLE(M) ) + ELSE +! Y(:,i) will be scaled to unit 2-norm + RWORK(i) = SCALE * ROOTSC + CALL ZLASCL( 'G',0, 0, RWORK(i), ONE, M, 1, & + Y(1,i), LDY, INFO2 ) ! LAPACK CALL +! Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC + END IF + ELSE + RWORK(i) = ZERO + END IF + END DO + DO i = 1, N +! Now, apply the same scaling to the columns of X. + IF ( RWORK(i) > ZERO ) THEN + CALL ZDSCAL( M, ONE/RWORK(i), X(1,i), 1 ) ! BLAS CALL +! X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC + ELSE IF ( RWORK(i) < ZERO ) THEN + CALL ZLASCL( 'G', 0, 0, -RWORK(i), & + ONE/DBLE(M), M, 1, X(1,i), LDX, INFO2 ) ! LAPACK CALL + ELSE IF ( ABS(X(IZAMAX(M, X(1,i),1),i )) & + /= ZERO ) THEN +! Y(:,i) is zero vector. If X(:,i) is not +! zero, then a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + END IF + END DO + END IF +! +! <2> SVD of the data snapshot matrix X. +! ===================================== +! The left singular vectors are stored in the array X. +! The right singular vectors are in the array W. +! The array W will later on contain the eigenvectors +! of a Rayleigh quotient. + NUMRNK = N + SELECT CASE ( WHTSVD ) + CASE (1) + CALL ZGESVD( 'O', 'S', M, N, X, LDX, RWORK, B, & + LDB, W, LDW, ZWORK, LZWORK, RWORK(N+1), INFO1 ) ! LAPACK CALL + T_OR_N = 'C' + CASE (2) + CALL ZGESDD( 'O', M, N, X, LDX, RWORK, B, LDB, W, & + LDW, ZWORK, LZWORK, RWORK(N+1), IWORK, INFO1 ) ! LAPACK CALL + T_OR_N = 'C' + CASE (3) + CALL ZGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, RWORK, Z, LDZ, W, LDW, & + NUMRNK, IWORK, LIWORK, ZWORK, & + LZWORK, RWORK(N+1), LRWORK-N, INFO1) ! LAPACK CALL + CALL ZLACPY( 'A', M, NUMRNK, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'C' + CASE (4) + CALL ZGEJSV( 'F', 'U', JSVOPT, 'R', 'N', 'P', M, & + N, X, LDX, RWORK, Z, LDZ, W, LDW, & + ZWORK, LZWORK, RWORK(N+1), LRWORK-N, IWORK, INFO1 ) ! LAPACK CALL + CALL ZLACPY( 'A', M, N, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'N' + XSCL1 = RWORK(N+1) + XSCL2 = RWORK(N+2) + IF ( XSCL1 /= XSCL2 ) THEN + ! This is an exceptional situation. If the + ! data matrices are not scaled and the + ! largest singular value of X overflows. + ! In that case ZGEJSV can return the SVD + ! in scaled form. The scaling factor can be used + ! to rescale the data (X and Y). + CALL ZLASCL( 'G', 0, 0, XSCL1, XSCL2, M, N, Y, LDY, INFO2 ) + END IF + END SELECT +! + IF ( INFO1 > 0 ) THEN + ! The SVD selected subroutine did not converge. + ! Return with an error code. + INFO = 2 + RETURN + END IF +! + IF ( RWORK(1) == ZERO ) THEN + ! The largest computed singular value of (scaled) + ! X is zero. Return error code -8 + ! (the 8th input variable had an illegal value). + K = 0 + INFO = -8 + CALL XERBLA('ZGEDMD',-INFO) + RETURN + END IF +! + !<3> Determine the numerical rank of the data + ! snapshots matrix X. This depends on the + ! parameters NRNK and TOL. + + SELECT CASE ( NRNK ) + CASE ( -1 ) + K = 1 + DO i = 2, NUMRNK + IF ( ( RWORK(i) <= RWORK(1)*TOL ) .OR. & + ( RWORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE ( -2 ) + K = 1 + DO i = 1, NUMRNK-1 + IF ( ( RWORK(i+1) <= RWORK(i)*TOL ) .OR. & + ( RWORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE DEFAULT + K = 1 + DO i = 2, NRNK + IF ( RWORK(i) <= SMALL ) EXIT + K = K + 1 + END DO + END SELECT + ! Now, U = X(1:M,1:K) is the SVD/POD basis for the + ! snapshot data in the input matrix X. + + !<4> Compute the Rayleigh quotient S = U^H * A * U. + ! Depending on the requested outputs, the computation + ! is organized to compute additional auxiliary + ! matrices (for the residuals and refinements). + ! + ! In all formulas below, we need V_k*Sigma_k^(-1) + ! where either V_k is in W(1:N,1:K), or V_k^H is in + ! W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). + IF ( LSAME(T_OR_N, 'N') ) THEN + DO i = 1, K + CALL ZDSCAL( N, ONE/RWORK(i), W(1,i), 1 ) ! BLAS CALL + ! W(1:N,i) = (ONE/RWORK(i)) * W(1:N,i) ! INTRINSIC + END DO + ELSE + ! This non-unit stride access is due to the fact + ! that ZGESVD, ZGESVDQ and ZGESDD return the + ! adjoint matrix of the right singular vectors. + !DO i = 1, K + ! CALL ZDSCAL( N, ONE/RWORK(i), W(i,1), LDW ) ! BLAS CALL + ! ! W(i,1:N) = (ONE/RWORK(i)) * W(i,1:N) ! INTRINSIC + !END DO + DO i = 1, K + RWORK(N+i) = ONE/RWORK(i) + END DO + DO j = 1, N + DO i = 1, K + W(i,j) = CMPLX(RWORK(N+i),ZERO,KIND=WP)*W(i,j) + END DO + END DO + END IF +! + IF ( WNTREF ) THEN + ! + ! Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) + ! for computing the refined Ritz vectors + ! (optionally, outside ZGEDMD). + CALL ZGEMM( 'N', T_OR_N, M, K, N, ZONE, Y, LDY, W, & + LDW, ZZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(CONJG(W(1:K,1:N)))) ! INTRINSIC, for T_OR_N=='C' + ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K)) ! INTRINSIC, for T_OR_N=='N' + ! + ! At this point Z contains + ! A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and + ! this is needed for computing the residuals. + ! This matrix is returned in the array B and + ! it can be used to compute refined Ritz vectors. + CALL ZLACPY( 'A', M, K, Z, LDZ, B, LDB ) ! BLAS CALL + ! B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC + + CALL ZGEMM( 'C', 'N', K, K, M, ZONE, X, LDX, Z, & + LDZ, ZZERO, S, LDS ) ! BLAS CALL + ! S(1:K,1:K) = MATMUL(TRANSPOSE(CONJG(X(1:M,1:K))),Z(1:M,1:K)) ! INTRINSIC + ! At this point S = U^H * A * U is the Rayleigh quotient. + ELSE + ! A * U(:,1:K) is not explicitly needed and the + ! computation is organized differently. The Rayleigh + ! quotient is computed more efficiently. + CALL ZGEMM( 'C', 'N', K, N, M, ZONE, X, LDX, Y, LDY, & + ZZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:K,1:N) = MATMUL( TRANSPOSE(CONJG(X(1:M,1:K))), Y(1:M,1:N) ) ! INTRINSIC + ! + CALL ZGEMM( 'N', T_OR_N, K, K, N, ZONE, Z, LDZ, W, & + LDW, ZZERO, S, LDS ) ! BLAS CALL + ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(CONJG(W(1:K,1:N)))) ! INTRINSIC, for T_OR_N=='T' + ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K))) ! INTRINSIC, for T_OR_N=='N' + ! At this point S = U^H * A * U is the Rayleigh quotient. + ! If the residuals are requested, save scaled V_k into Z. + ! Recall that V_k or V_k^H is stored in W. + IF ( WNTRES .OR. WNTEX ) THEN + IF ( LSAME(T_OR_N, 'N') ) THEN + CALL ZLACPY( 'A', N, K, W, LDW, Z, LDZ ) + ELSE + CALL ZLACPY( 'A', K, N, W, LDW, Z, LDZ ) + END IF + END IF + END IF +! + !<5> Compute the Ritz values and (if requested) the + ! right eigenvectors of the Rayleigh quotient. + ! + CALL ZGEEV( 'N', JOBZL, K, S, LDS, EIGS, W, LDW, & + W, LDW, ZWORK, LZWORK, RWORK(N+1), INFO1 ) ! LAPACK CALL + ! + ! W(1:K,1:K) contains the eigenvectors of the Rayleigh + ! quotient. See the description of Z. + ! Also, see the description of ZGEEV. + IF ( INFO1 > 0 ) THEN + ! ZGEEV failed to compute the eigenvalues and + ! eigenvectors of the Rayleigh quotient. + INFO = 3 + RETURN + END IF +! + ! <6> Compute the eigenvectors (if requested) and, + ! the residuals (if requested). + ! + IF ( WNTVEC .OR. WNTEX ) THEN + IF ( WNTRES ) THEN + IF ( WNTREF ) THEN + ! Here, if the refinement is requested, we have + ! A*U(:,1:K) already computed and stored in Z. + ! For the residuals, need Y = A * U(:,1;K) * W. + CALL ZGEMM( 'N', 'N', M, K, K, ZONE, Z, LDZ, W, & + LDW, ZZERO, Y, LDY ) ! BLAS CALL + ! Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC + ! This frees Z; Y contains A * U(:,1:K) * W. + ELSE + ! Compute S = V_k * Sigma_k^(-1) * W, where + ! V_k * Sigma_k^(-1) (or its adjoint) is stored in Z + CALL ZGEMM( T_OR_N, 'N', N, K, K, ZONE, Z, LDZ, & + W, LDW, ZZERO, S, LDS ) + ! Then, compute Z = Y * S = + ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = + ! = A * U(:,1:K) * W(1:K,1:K) + CALL ZGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & + LDS, ZZERO, Z, LDZ ) + ! Save a copy of Z into Y and free Z for holding + ! the Ritz vectors. + CALL ZLACPY( 'A', M, K, Z, LDZ, Y, LDY ) + IF ( WNTEX ) CALL ZLACPY( 'A', M, K, Z, LDZ, B, LDB ) + END IF + ELSE IF ( WNTEX ) THEN + ! Compute S = V_k * Sigma_k^(-1) * W, where + ! V_k * Sigma_k^(-1) is stored in Z + CALL ZGEMM( T_OR_N, 'N', N, K, K, ZONE, Z, LDZ, & + W, LDW, ZZERO, S, LDS ) + ! Then, compute Z = Y * S = + ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = + ! = A * U(:,1:K) * W(1:K,1:K) + CALL ZGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & + LDS, ZZERO, B, LDB ) + ! The above call replaces the following two calls + ! that were used in the developing-testing phase. + ! CALL ZGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & + ! LDS, ZZERO, Z, LDZ) + ! Save a copy of Z into B and free Z for holding + ! the Ritz vectors. + ! CALL ZLACPY( 'A', M, K, Z, LDZ, B, LDB ) + END IF +! + ! Compute the Ritz vectors + IF ( WNTVEC ) CALL ZGEMM( 'N', 'N', M, K, K, ZONE, X, LDX, W, LDW, & + ZZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC +! + IF ( WNTRES ) THEN + DO i = 1, K + CALL ZAXPY( M, -EIGS(i), Z(1,i), 1, Y(1,i), 1 ) ! BLAS CALL + ! Y(1:M,i) = Y(1:M,i) - EIGS(i) * Z(1:M,i) ! INTRINSIC + RES(i) = DZNRM2( M, Y(1,i), 1 ) ! BLAS CALL + END DO + END IF + END IF +! + IF ( WHTSVD == 4 ) THEN + RWORK(N+1) = XSCL1 + RWORK(N+2) = XSCL2 + END IF +! +! Successful exit. + IF ( .NOT. BADXY ) THEN + INFO = 0 + ELSE + ! A warning on possible data inconsistency. + ! This should be a rare event. + INFO = 4 + END IF +!............................................................ + RETURN +! ...... + END SUBROUTINE ZGEDMD + diff --git a/lapack-netlib/SRC/zgedmdq.c b/lapack-netlib/SRC/zgedmdq.c new file mode 100644 index 000000000..1815f0814 --- /dev/null +++ b/lapack-netlib/SRC/zgedmdq.c @@ -0,0 +1,1293 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i= 0 */ +/* The state space dimension (the number of rows of F). */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshots from a single trajectory, */ +/* taken at equidistant discrete times. This is the */ +/* number of columns of F. */ +/* ..... */ +/* F (input/output) COMPLEX(KIND=WP) M-by-N array */ +/* > On entry, */ +/* the columns of F are the sequence of data snapshots */ +/* from a single trajectory, taken at equidistant discrete */ +/* times. It is assumed that the column norms of F are */ +/* in the range of the normalized floating point numbers. */ +/* < On exit, */ +/* If JOBQ == 'Q', the array F contains the orthogonal */ +/* matrix/factor of the QR factorization of the initial */ +/* data snapshots matrix F. See the description of JOBQ. */ +/* If JOBQ == 'N', the entries in F strictly below the main */ +/* diagonal contain, column-wise, the information on the */ +/* Householder vectors, as returned by ZGEQRF. The */ +/* remaining information to restore the orthogonal matrix */ +/* of the initial QR factorization is stored in ZWORK(1:MIN(M,N)). */ +/* See the description of ZWORK. */ +/* ..... */ +/* LDF (input) INTEGER, LDF >= M */ +/* The leading dimension of the array F. */ +/* ..... */ +/* X (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array */ +/* X is used as workspace to hold representations of the */ +/* leading N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, the leading K columns of X contain the leading */ +/* K left singular vectors of the above described content */ +/* of X. To lift them to the space of the left singular */ +/* vectors U(:,1:K) of the input data, pre-multiply with the */ +/* Q factor from the initial QR factorization. */ +/* See the descriptions of F, K, V and Z. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= N */ +/* The leading dimension of the array X. */ +/* ..... */ +/* Y (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N) array */ +/* Y is used as workspace to hold representations of the */ +/* trailing N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, */ +/* If JOBT == 'R', Y contains the MIN(M,N)-by-N upper */ +/* triangular factor from the QR factorization of the data */ +/* snapshot matrix F. */ +/* ..... */ +/* LDY (input) INTEGER , LDY >= N */ +/* The leading dimension of the array Y. */ +/* ..... */ +/* NRNK (input) INTEGER */ +/* Determines the mode how to compute the numerical rank, */ +/* i.e. how to truncate small singular values of the input */ +/* matrix X. On input, if */ +/* NRNK = -1 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(1) */ +/* This option is recommended. */ +/* NRNK = -2 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(i-1) */ +/* This option is included for R&D purposes. */ +/* It requires highly accurate SVD, which */ +/* may not be feasible. */ +/* The numerical rank can be enforced by using positive */ +/* value of NRNK as follows: */ +/* 0 < NRNK <= N-1 :: at most NRNK largest singular values */ +/* will be used. If the number of the computed nonzero */ +/* singular values is less than NRNK, then only those */ +/* nonzero values will be used and the actually used */ +/* dimension is less than NRNK. The actual number of */ +/* the nonzero singular values is returned in the variable */ +/* K. See the description of K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the SVD/POD basis for the leading N-1 */ +/* data snapshots (columns of F) and the number of the */ +/* computed Ritz pairs. The value of K is determined */ +/* according to the rule set by the parameters NRNK and */ +/* TOL. See the descriptions of NRNK and TOL. */ +/* ..... */ +/* EIGS (output) COMPLEX(KIND=WP) (N-1)-by-1 array */ +/* The leading K (K<=N-1) entries of EIGS contain */ +/* the computed eigenvalues (Ritz values). */ +/* See the descriptions of K, and Z. */ +/* ..... */ +/* Z (workspace/output) COMPLEX(KIND=WP) M-by-(N-1) array */ +/* If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) */ +/* is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. */ +/* If JOBZ == 'F', then the Z(:,i)'s are given implicitly as */ +/* Z*V, where Z contains orthonormal matrix (the product of */ +/* Q from the initial QR factorization and the SVD/POD_basis */ +/* returned by ZGEDMD in X) and the second factor (the */ +/* eigenvectors of the Rayleigh quotient) is in the array V, */ +/* as returned by ZGEDMD. That is, X(:,1:K)*V(:,i) */ +/* is an eigenvector corresponding to EIGS(i). The columns */ +/* of V(1:K,1:K) are the computed eigenvectors of the */ +/* K-by-K Rayleigh quotient. */ +/* See the descriptions of EIGS, X and V. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... */ +/* RES (output) REAL(KIND=WP) (N-1)-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs, */ +/* RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. */ +/* See the description of EIGS and Z. */ +/* ..... */ +/* B (output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array. */ +/* IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:N,1;K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* In both cases, the content of B can be lifted to the */ +/* original dimension of the input data by pre-multiplying */ +/* with the Q factor from the initial QR factorization. */ +/* Here A denotes a compression of the underlying operator. */ +/* See the descriptions of F and X. */ +/* If JOBF =='N', then B is not referenced. */ +/* ..... */ +/* LDB (input) INTEGER, LDB >= MIN(M,N) */ +/* The leading dimension of the array B. */ +/* ..... */ +/* V (workspace/output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array */ +/* On exit, V(1:K,1:K) V contains the K eigenvectors of */ +/* the Rayleigh quotient. The Ritz vectors */ +/* (returned in Z) are the product of Q from the initial QR */ +/* factorization (see the description of F) X (see the */ +/* description of X) and V. */ +/* ..... */ +/* LDV (input) INTEGER, LDV >= N-1 */ +/* The leading dimension of the array V. */ +/* ..... */ +/* S (output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array */ +/* The array S(1:K,1:K) is used for the matrix Rayleigh */ +/* quotient. This content is overwritten during */ +/* the eigenvalue decomposition by ZGEEV. */ +/* See the description of K. */ +/* ..... */ +/* LDS (input) INTEGER, LDS >= N-1 */ +/* The leading dimension of the array S. */ +/* ..... */ +/* ZWORK (workspace/output) COMPLEX(KIND=WP) LWORK-by-1 array */ +/* On exit, */ +/* ZWORK(1:MIN(M,N)) contains the scalar factors of the */ +/* elementary reflectors as returned by ZGEQRF of the */ +/* M-by-N input matrix F. */ +/* If the call to ZGEDMDQ is only workspace query, then */ +/* ZWORK(1) contains the minimal complex workspace length and */ +/* ZWORK(2) is the optimal complex workspace length. */ +/* Hence, the length of work is at least 2. */ +/* See the description of LZWORK. */ +/* ..... */ +/* LZWORK (input) INTEGER */ +/* The minimal length of the workspace vector ZWORK. */ +/* LZWORK is calculated as follows: */ +/* Let MLWQR = N (minimal workspace for ZGEQRF[M,N]) */ +/* MLWDMD = minimal workspace for ZGEDMD (see the */ +/* description of LWORK in ZGEDMD) */ +/* MLWMQR = N (minimal workspace for */ +/* ZUNMQR['L','N',M,N,N]) */ +/* MLWGQR = N (minimal workspace for ZUNGQR[M,N,N]) */ +/* MINMN = MIN(M,N) */ +/* Then */ +/* LZWORK = MAX(2, MIN(M,N)+MLWQR, MINMN+MLWDMD) */ +/* is further updated as follows: */ +/* if JOBZ == 'V' or JOBZ == 'F' THEN */ +/* LZWORK = MAX(LZWORK, MINMN+MLWMQR) */ +/* if JOBQ == 'Q' THEN */ +/* LZWORK = MAX(ZLWORK, MINMN+MLWGQR) */ + +/* ..... */ +/* WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */ +/* On exit, */ +/* WORK(1:N-1) contains the singular values of */ +/* the input submatrix F(1:M,1:N-1). */ +/* If the call to ZGEDMDQ is only workspace query, then */ +/* WORK(1) contains the minimal workspace length and */ +/* WORK(2) is the optimal workspace length. hence, the */ +/* length of work is at least 2. */ +/* See the description of LWORK. */ +/* ..... */ +/* LWORK (input) INTEGER */ +/* The minimal length of the workspace vector WORK. */ +/* LWORK is the same as in ZGEDMD, because in ZGEDMDQ */ +/* only ZGEDMD requires real workspace for snapshots */ +/* of dimensions MIN(M,N)-by-(N-1). */ +/* If on entry LWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace length for WORK. */ +/* ..... */ +/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */ +/* Workspace that is required only if WHTSVD equals */ +/* 2 , 3 or 4. (See the description of WHTSVD). */ +/* If on entry LWORK =-1 or LIWORK=-1, then the */ +/* minimal length of IWORK is computed and returned in */ +/* IWORK(1). See the description of LIWORK. */ +/* ..... */ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* Let M1=MIN(M,N), N1=N-1. Then */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) */ +/* If on entry LIWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). */ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ +/* COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) */ + +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ + +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ + +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* External subroutines */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ +/* .......................................................... */ + /* Parameter adjustments */ + f_dim1 = *ldf; + f_offset = 1 + f_dim1 * 1; + f -= f_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --eigs; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + v_dim1 = *ldv; + v_offset = 1 + v_dim1 * 1; + v -= v_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --zwork; + --work; + --iwork; + + /* Function Body */ + one = 1.f; + zero = 0.f; + zzero.r = 0.f, zzero.i = 0.f; + +/* Test the input arguments */ + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntvcf = lsame_(jobz, "F"); + wntvcq = lsame_(jobz, "Q"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + wantq = lsame_(jobq, "Q"); + wnttrf = lsame_(jobt, "R"); + minmn = f2cmin(*m,*n); + *info = 0; + lquery = *lzwork == -1 || *lwork == -1 || *liwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || wntvcf || wntvcq || lsame_(jobz, "N"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && lsame_(jobz, "N")) { + *info = -3; + } else if (! (wantq || lsame_(jobq, "N"))) { + *info = -4; + } else if (! (wnttrf || lsame_(jobt, "N"))) { + *info = -5; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -6; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -7; + } else if (*m < 0) { + *info = -8; + } else if (*n < 0 || *n > *m + 1) { + *info = -9; + } else if (*ldf < *m) { + *info = -11; + } else if (*ldx < minmn) { + *info = -13; + } else if (*ldy < minmn) { + *info = -15; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -16; + } else if (*tol < zero || *tol >= one) { + *info = -17; + } else if (*ldz < *m) { + *info = -21; + } else if ((wntref || wntex) && *ldb < minmn) { + *info = -24; + } else if (*ldv < *n - 1) { + *info = -26; + } else if (*lds < *n - 1) { + *info = -28; + } + + if (wntvec || wntvcf || wntvcq) { + *(unsigned char *)jobvl = 'V'; + } else { + *(unsigned char *)jobvl = 'N'; + } + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0 || *n == 1) { +/* All output except K is void. INFO=1 signals */ +/* the void input. In case of a workspace query, */ +/* the minimal workspace lengths are returned. */ + if (lquery) { + iwork[1] = 1; + zwork[1].r = 2., zwork[1].i = 0.; + zwork[2].r = 2., zwork[2].i = 0.; + work[1] = 2.; + work[2] = 2.; + } else { + *k = 0; + } + *info = 1; + return 0; + } + mlrwrk = 2; + mlwork = 2; + olwork = 2; + iminwr = 1; + mlwqr = f2cmax(1,*n); +/* Minimal workspace length for ZGEQRF. */ +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + zgeqrf_(m, n, &f[f_offset], ldf, &zwork[1], &zwork[1], &c_n1, & + info1); + olwqr = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwqr; + olwork = f2cmax(i__1,i__2); + } + i__1 = *n - 1; + zgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], + ldx, &y[y_offset], ldy, nrnk, tol, k, &eigs[1], &z__[z_offset] + , ldz, &res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[ + s_offset], lds, &zwork[1], &c_n1, &work[1], &c_n1, &iwork[1], + &c_n1, &info1); + mlwdmd = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwdmd; + mlwork = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = (integer) work[1]; + mlrwrk = f2cmax(i__1,i__2); + iminwr = f2cmax(iminwr,iwork[1]); + if (lquery) { + olwdmd = (integer) zwork[2].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwdmd; + olwork = f2cmax(i__1,i__2); + } + if (wntvec || wntvcf) { + mlwmqr = f2cmax(1,*n); +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwmqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + zunmqr_("L", "N", m, n, &minmn, &f[f_offset], ldf, &zwork[1], + &z__[z_offset], ldz, &zwork[1], &c_n1, &info1); + olwmqr = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwmqr; + olwork = f2cmax(i__1,i__2); + } + } + if (wantq) { + mlwgqr = f2cmax(1,*n); +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwgqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + zungqr_(m, &minmn, &minmn, &f[f_offset], ldf, &zwork[1], & + zwork[1], &c_n1, &info1); + olwgqr = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwgqr; + olwork = f2cmax(i__1,i__2); + } + } + if (*liwork < iminwr && ! lquery) { + *info = -34; + } + if (*lwork < mlrwrk && ! lquery) { + *info = -32; + } + if (*lzwork < mlwork && ! lquery) { + *info = -30; + } + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZGEDMDQ", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + zwork[1].r = (doublereal) mlwork, zwork[1].i = 0.; + zwork[2].r = (doublereal) olwork, zwork[2].i = 0.; + work[1] = (doublereal) mlrwrk; + work[2] = (doublereal) mlrwrk; + return 0; + } +/* ..... */ +/* Initial QR factorization that is used to represent the */ +/* snapshots as elements of lower dimensional subspace. */ +/* For large scale computation with M >> N, at this place */ +/* one can use an out of core QRF. */ + + i__1 = *lzwork - minmn; + zgeqrf_(m, n, &f[f_offset], ldf, &zwork[1], &zwork[minmn + 1], &i__1, & + info1); + +/* Define X and Y as the snapshots representations in the */ +/* orthogonal basis computed in the QR factorization. */ +/* X corresponds to the leading N-1 and Y to the trailing */ +/* N-1 snapshots. */ + i__1 = *n - 1; + zlaset_("L", &minmn, &i__1, &zzero, &zzero, &x[x_offset], ldx); + i__1 = *n - 1; + zlacpy_("U", &minmn, &i__1, &f[f_offset], ldf, &x[x_offset], ldx); + i__1 = *n - 1; + zlacpy_("A", &minmn, &i__1, &f[(f_dim1 << 1) + 1], ldf, &y[y_offset], ldy); + if (*m >= 3) { + i__1 = minmn - 2; + i__2 = *n - 2; + zlaset_("L", &i__1, &i__2, &zzero, &zzero, &y[y_dim1 + 3], ldy); + } + +/* Compute the DMD of the projected snapshot pairs (X,Y) */ + i__1 = *n - 1; + i__2 = *lzwork - minmn; + zgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], ldx, + &y[y_offset], ldy, nrnk, tol, k, &eigs[1], &z__[z_offset], ldz, & + res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[s_offset], lds, & + zwork[minmn + 1], &i__2, &work[1], lwork, &iwork[1], liwork, & + info1); + if (info1 == 2 || info1 == 3) { +/* Return with error code. See ZGEDMD for details. */ + *info = info1; + return 0; + } else { + *info = info1; + } + +/* The Ritz vectors (Koopman modes) can be explicitly */ +/* formed or returned in factored form. */ + if (wntvec) { +/* Compute the eigenvectors explicitly. */ + if (*m > minmn) { + i__1 = *m - minmn; + zlaset_("A", &i__1, k, &zzero, &zzero, &z__[minmn + 1 + z_dim1], + ldz); + } + i__1 = *lzwork - minmn; + zunmqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &zwork[1], &z__[ + z_offset], ldz, &zwork[minmn + 1], &i__1, &info1); + } else if (wntvcf) { +/* Return the Ritz vectors (eigenvectors) in factored */ +/* form Z*V, where Z contains orthonormal matrix (the */ +/* product of Q from the initial QR factorization and */ +/* the SVD/POD_basis returned by ZGEDMD in X) and the */ +/* second factor (the eigenvectors of the Rayleigh */ +/* quotient) is in the array V, as returned by ZGEDMD. */ + zlacpy_("A", n, k, &x[x_offset], ldx, &z__[z_offset], ldz); + if (*m > *n) { + i__1 = *m - *n; + zlaset_("A", &i__1, k, &zzero, &zzero, &z__[*n + 1 + z_dim1], ldz); + } + i__1 = *lzwork - minmn; + zunmqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &zwork[1], &z__[ + z_offset], ldz, &zwork[minmn + 1], &i__1, &info1); + } + +/* Some optional output variables: */ + +/* The upper triangular factor R in the initial QR */ +/* factorization is optionally returned in the array Y. */ +/* This is useful if this call to ZGEDMDQ is to be */ +/* followed by a streaming DMD that is implemented in a */ +/* QR compressed form. */ + if (wnttrf) { +/* Return the upper triangular R in Y */ + zlaset_("A", &minmn, n, &zzero, &zzero, &y[y_offset], ldy); + zlacpy_("U", &minmn, n, &f[f_offset], ldf, &y[y_offset], ldy); + } + +/* The orthonormal/unitary factor Q in the initial QR */ +/* factorization is optionally returned in the array F. */ +/* Same as with the triangular factor above, this is */ +/* useful in a streaming DMD. */ + if (wantq) { +/* Q overwrites F */ + i__1 = *lzwork - minmn; + zungqr_(m, &minmn, &minmn, &f[f_offset], ldf, &zwork[1], &zwork[minmn + + 1], &i__1, &info1); + } + + return 0; + +} /* zgedmdq_ */ + diff --git a/lapack-netlib/SRC/zgedmdq.f90 b/lapack-netlib/SRC/zgedmdq.f90 new file mode 100644 index 000000000..51be72a32 --- /dev/null +++ b/lapack-netlib/SRC/zgedmdq.f90 @@ -0,0 +1,689 @@ +SUBROUTINE ZGEDMDQ( JOBS, JOBZ, JOBR, JOBQ, JOBT, JOBF, & + WHTSVD, M, N, F, LDF, X, LDX, Y, & + LDY, NRNK, TOL, K, EIGS, & + Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, ZWORK, LZWORK, WORK, LWORK, & + IWORK, LIWORK, INFO ) +! March 2023 +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 +!..... +! Scalar arguments + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBQ, & + JOBT, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDF, LDX, & + LDY, NRNK, LDZ, LDB, LDV, & + LDS, LZWORK, LWORK, LIWORK + INTEGER, INTENT(OUT) :: INFO, K + REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments + COMPLEX(KIND=WP), INTENT(INOUT) :: F(LDF,*) + COMPLEX(KIND=WP), INTENT(OUT) :: X(LDX,*), Y(LDY,*), & + Z(LDZ,*), B(LDB,*), & + V(LDV,*), S(LDS,*) + COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) + COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) + REAL(KIND=WP), INTENT(OUT) :: RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +!..... +! Purpose +! ======= +! ZGEDMDQ computes the Dynamic Mode Decomposition (DMD) for +! a pair of data snapshot matrices, using a QR factorization +! based compression of the data. For the input matrices +! X and Y such that Y = A*X with an unaccessible matrix +! A, ZGEDMDQ computes a certain number of Ritz pairs of A using +! the standard Rayleigh-Ritz extraction from a subspace of +! range(X) that is determined using the leading left singular +! vectors of X. Optionally, ZGEDMDQ returns the residuals +! of the computed Ritz pairs, the information needed for +! a refinement of the Ritz vectors, or the eigenvectors of +! the Exact DMD. +! For further details see the references listed +! below. For more details of the implementation see [3]. +! +! References +! ========== +! [1] P. Schmid: Dynamic mode decomposition of numerical +! and experimental data, +! Journal of Fluid Mechanics 656, 5-28, 2010. +! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +! decompositions: analysis and enhancements, +! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +! [3] Z. Drmac: A LAPACK implementation of the Dynamic +! Mode Decomposition I. Technical report. AIMDyn Inc. +! and LAPACK Working Note 298. +! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +! Brunton, N. Kutz: On Dynamic Mode Decomposition: +! Theory and Applications, Journal of Computational +! Dynamics 1(2), 391 -421, 2014. +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! and supported by +! - DARPA SBIR project "Koopman Operator-Based Forecasting +! for Nonstationary Processes from Near-Term, Limited +! Observational Data" Contract No: W31P4Q-21-C-0007 +! - DARPA PAI project "Physics-Informed Machine Learning +! Methodologies" Contract No: HR0011-18-9-0033 +! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +! Framework for Space-Time Analysis of Process Dynamics" +! Contract No: HR0011-16-C-0116 +! Any opinions, findings and conclusions or recommendations +! expressed in this material are those of the author and +! do not necessarily reflect the views of the DARPA SBIR +! Program Office. +!============================================================ +! Distribution Statement A: +! Approved for Public Release, Distribution Unlimited. +! Cleared by DARPA on September 29, 2022 +!============================================================ +!...................................................................... +! Arguments +! ========= +! JOBS (input) CHARACTER*1 +! Determines whether the initial data snapshots are scaled +! by a diagonal matrix. The data snapshots are the columns +! of F. The leading N-1 columns of F are denoted X and the +! trailing N-1 columns are denoted Y. +! 'S' :: The data snapshots matrices X and Y are multiplied +! with a diagonal matrix D so that X*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'C' :: The snapshots are scaled as with the 'S' option. +! If it is found that an i-th column of X is zero +! vector and the corresponding i-th column of Y is +! non-zero, then the i-th column of Y is set to +! zero and a warning flag is raised. +! 'Y' :: The data snapshots matrices X and Y are multiplied +! by a diagonal matrix D so that Y*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'N' :: No data scaling. +!..... +! JOBZ (input) CHARACTER*1 +! Determines whether the eigenvectors (Koopman modes) will +! be computed. +! 'V' :: The eigenvectors (Koopman modes) will be computed +! and returned in the matrix Z. +! See the description of Z. +! 'F' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Z*V, where Z +! is orthonormal and V contains the eigenvectors +! of the corresponding Rayleigh quotient. +! See the descriptions of F, V, Z. +! 'Q' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Q*Z, where Z +! contains the eigenvectors of the compression of the +! underlying discretized operator onto the span of +! the data snapshots. See the descriptions of F, V, Z. +! Q is from the initial QR factorization. +! 'N' :: The eigenvectors are not computed. +!..... +! JOBR (input) CHARACTER*1 +! Determines whether to compute the residuals. +! 'R' :: The residuals for the computed eigenpairs will +! be computed and stored in the array RES. +! See the description of RES. +! For this option to be legal, JOBZ must be 'V'. +! 'N' :: The residuals are not computed. +!..... +! JOBQ (input) CHARACTER*1 +! Specifies whether to explicitly compute and return the +! unitary matrix from the QR factorization. +! 'Q' :: The matrix Q of the QR factorization of the data +! snapshot matrix is computed and stored in the +! array F. See the description of F. +! 'N' :: The matrix Q is not explicitly computed. +!..... +! JOBT (input) CHARACTER*1 +! Specifies whether to return the upper triangular factor +! from the QR factorization. +! 'R' :: The matrix R of the QR factorization of the data +! snapshot matrix F is returned in the array Y. +! See the description of Y and Further details. +! 'N' :: The matrix R is not returned. +!..... +! JOBF (input) CHARACTER*1 +! Specifies whether to store information needed for post- +! processing (e.g. computing refined Ritz vectors) +! 'R' :: The matrix needed for the refinement of the Ritz +! vectors is computed and stored in the array B. +! See the description of B. +! 'E' :: The unscaled eigenvectors of the Exact DMD are +! computed and returned in the array B. See the +! description of B. +! 'N' :: No eigenvector refinement data is computed. +! To be useful on exit, this option needs JOBQ='Q'. +!..... +! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +! Allows for a selection of the SVD algorithm from the +! LAPACK library. +! 1 :: ZGESVD (the QR SVD algorithm) +! 2 :: ZGESDD (the Divide and Conquer algorithm; if enough +! workspace available, this is the fastest option) +! 3 :: ZGESVDQ (the preconditioned QR SVD ; this and 4 +! are the most accurate options) +! 4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 +! are the most accurate options) +! For the four methods above, a significant difference in +! the accuracy of small singular values is possible if +! the snapshots vary in norm so that X is severely +! ill-conditioned. If small (smaller than EPS*||X||) +! singular values are of interest and JOBS=='N', then +! the options (3, 4) give the most accurate results, where +! the option 4 is slightly better and with stronger +! theoretical background. +! If JOBS=='S', i.e. the columns of X will be normalized, +! then all methods give nearly equally accurate results. +!..... +! M (input) INTEGER, M >= 0 +! The state space dimension (the number of rows of F). +!..... +! N (input) INTEGER, 0 <= N <= M +! The number of data snapshots from a single trajectory, +! taken at equidistant discrete times. This is the +! number of columns of F. +!..... +! F (input/output) COMPLEX(KIND=WP) M-by-N array +! > On entry, +! the columns of F are the sequence of data snapshots +! from a single trajectory, taken at equidistant discrete +! times. It is assumed that the column norms of F are +! in the range of the normalized floating point numbers. +! < On exit, +! If JOBQ == 'Q', the array F contains the orthogonal +! matrix/factor of the QR factorization of the initial +! data snapshots matrix F. See the description of JOBQ. +! If JOBQ == 'N', the entries in F strictly below the main +! diagonal contain, column-wise, the information on the +! Householder vectors, as returned by ZGEQRF. The +! remaining information to restore the orthogonal matrix +! of the initial QR factorization is stored in ZWORK(1:MIN(M,N)). +! See the description of ZWORK. +!..... +! LDF (input) INTEGER, LDF >= M +! The leading dimension of the array F. +!..... +! X (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array +! X is used as workspace to hold representations of the +! leading N-1 snapshots in the orthonormal basis computed +! in the QR factorization of F. +! On exit, the leading K columns of X contain the leading +! K left singular vectors of the above described content +! of X. To lift them to the space of the left singular +! vectors U(:,1:K) of the input data, pre-multiply with the +! Q factor from the initial QR factorization. +! See the descriptions of F, K, V and Z. +!..... +! LDX (input) INTEGER, LDX >= N +! The leading dimension of the array X. +!..... +! Y (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N) array +! Y is used as workspace to hold representations of the +! trailing N-1 snapshots in the orthonormal basis computed +! in the QR factorization of F. +! On exit, +! If JOBT == 'R', Y contains the MIN(M,N)-by-N upper +! triangular factor from the QR factorization of the data +! snapshot matrix F. +!..... +! LDY (input) INTEGER , LDY >= N +! The leading dimension of the array Y. +!..... +! NRNK (input) INTEGER +! Determines the mode how to compute the numerical rank, +! i.e. how to truncate small singular values of the input +! matrix X. On input, if +! NRNK = -1 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(1) +! This option is recommended. +! NRNK = -2 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(i-1) +! This option is included for R&D purposes. +! It requires highly accurate SVD, which +! may not be feasible. +! The numerical rank can be enforced by using positive +! value of NRNK as follows: +! 0 < NRNK <= N-1 :: at most NRNK largest singular values +! will be used. If the number of the computed nonzero +! singular values is less than NRNK, then only those +! nonzero values will be used and the actually used +! dimension is less than NRNK. The actual number of +! the nonzero singular values is returned in the variable +! K. See the description of K. +!..... +! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +! The tolerance for truncating small singular values. +! See the description of NRNK. +!..... +! K (output) INTEGER, 0 <= K <= N +! The dimension of the SVD/POD basis for the leading N-1 +! data snapshots (columns of F) and the number of the +! computed Ritz pairs. The value of K is determined +! according to the rule set by the parameters NRNK and +! TOL. See the descriptions of NRNK and TOL. +!..... +! EIGS (output) COMPLEX(KIND=WP) (N-1)-by-1 array +! The leading K (K<=N-1) entries of EIGS contain +! the computed eigenvalues (Ritz values). +! See the descriptions of K, and Z. +!..... +! Z (workspace/output) COMPLEX(KIND=WP) M-by-(N-1) array +! If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) +! is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. +! If JOBZ == 'F', then the Z(:,i)'s are given implicitly as +! Z*V, where Z contains orthonormal matrix (the product of +! Q from the initial QR factorization and the SVD/POD_basis +! returned by ZGEDMD in X) and the second factor (the +! eigenvectors of the Rayleigh quotient) is in the array V, +! as returned by ZGEDMD. That is, X(:,1:K)*V(:,i) +! is an eigenvector corresponding to EIGS(i). The columns +! of V(1:K,1:K) are the computed eigenvectors of the +! K-by-K Rayleigh quotient. +! See the descriptions of EIGS, X and V. +!..... +! LDZ (input) INTEGER , LDZ >= M +! The leading dimension of the array Z. +!..... +! RES (output) REAL(KIND=WP) (N-1)-by-1 array +! RES(1:K) contains the residuals for the K computed +! Ritz pairs, +! RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. +! See the description of EIGS and Z. +!..... +! B (output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array. +! IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can +! be used for computing the refined vectors; see further +! details in the provided references. +! If JOBF == 'E', B(1:N,1;K) contains +! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +! Exact DMD, up to scaling by the inverse eigenvalues. +! In both cases, the content of B can be lifted to the +! original dimension of the input data by pre-multiplying +! with the Q factor from the initial QR factorization. +! Here A denotes a compression of the underlying operator. +! See the descriptions of F and X. +! If JOBF =='N', then B is not referenced. +!..... +! LDB (input) INTEGER, LDB >= MIN(M,N) +! The leading dimension of the array B. +!..... +! V (workspace/output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array +! On exit, V(1:K,1:K) V contains the K eigenvectors of +! the Rayleigh quotient. The Ritz vectors +! (returned in Z) are the product of Q from the initial QR +! factorization (see the description of F) X (see the +! description of X) and V. +!..... +! LDV (input) INTEGER, LDV >= N-1 +! The leading dimension of the array V. +!..... +! S (output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array +! The array S(1:K,1:K) is used for the matrix Rayleigh +! quotient. This content is overwritten during +! the eigenvalue decomposition by ZGEEV. +! See the description of K. +!..... +! LDS (input) INTEGER, LDS >= N-1 +! The leading dimension of the array S. +!..... +! ZWORK (workspace/output) COMPLEX(KIND=WP) LWORK-by-1 array +! On exit, +! ZWORK(1:MIN(M,N)) contains the scalar factors of the +! elementary reflectors as returned by ZGEQRF of the +! M-by-N input matrix F. +! If the call to ZGEDMDQ is only workspace query, then +! ZWORK(1) contains the minimal complex workspace length and +! ZWORK(2) is the optimal complex workspace length. +! Hence, the length of work is at least 2. +! See the description of LZWORK. +!..... +! LZWORK (input) INTEGER +! The minimal length of the workspace vector ZWORK. +! LZWORK is calculated as follows: +! Let MLWQR = N (minimal workspace for ZGEQRF[M,N]) +! MLWDMD = minimal workspace for ZGEDMD (see the +! description of LWORK in ZGEDMD) +! MLWMQR = N (minimal workspace for +! ZUNMQR['L','N',M,N,N]) +! MLWGQR = N (minimal workspace for ZUNGQR[M,N,N]) +! MINMN = MIN(M,N) +! Then +! LZWORK = MAX(2, MIN(M,N)+MLWQR, MINMN+MLWDMD) +! is further updated as follows: +! if JOBZ == 'V' or JOBZ == 'F' THEN +! LZWORK = MAX(LZWORK, MINMN+MLWMQR) +! if JOBQ == 'Q' THEN +! LZWORK = MAX(ZLWORK, MINMN+MLWGQR) +! +!..... +! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +! On exit, +! WORK(1:N-1) contains the singular values of +! the input submatrix F(1:M,1:N-1). +! If the call to ZGEDMDQ is only workspace query, then +! WORK(1) contains the minimal workspace length and +! WORK(2) is the optimal workspace length. hence, the +! length of work is at least 2. +! See the description of LWORK. +!..... +! LWORK (input) INTEGER +! The minimal length of the workspace vector WORK. +! LWORK is the same as in ZGEDMD, because in ZGEDMDQ +! only ZGEDMD requires real workspace for snapshots +! of dimensions MIN(M,N)-by-(N-1). +! If on entry LWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace length for WORK. +!..... +! IWORK (workspace/output) INTEGER LIWORK-by-1 array +! Workspace that is required only if WHTSVD equals +! 2 , 3 or 4. (See the description of WHTSVD). +! If on entry LWORK =-1 or LIWORK=-1, then the +! minimal length of IWORK is computed and returned in +! IWORK(1). See the description of LIWORK. +!..... +! LIWORK (input) INTEGER +! The minimal length of the workspace vector IWORK. +! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +! Let M1=MIN(M,N), N1=N-1. Then +! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) +! If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) +! If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) +! If on entry LIWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! INFO (output) INTEGER +! -i < 0 :: On entry, the i-th argument had an +! illegal value +! = 0 :: Successful return. +! = 1 :: Void input. Quick exit (M=0 or N=0). +! = 2 :: The SVD computation of X did not converge. +! Suggestion: Check the input data and/or +! repeat with different WHTSVD. +! = 3 :: The computation of the eigenvalues did not +! converge. +! = 4 :: If data scaling was requested on input and +! the procedure found inconsistency in the data +! such that for some column index i, +! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +! to zero if JOBS=='C'. The computation proceeds +! with original or modified data and warning +! flag is set with INFO=4. +!............................................................. +!............................................................. +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP +! COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) + COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) +! +! Local scalars +! ~~~~~~~~~~~~~ + INTEGER :: IMINWR, INFO1, MINMN, MLRWRK, & + MLWDMD, MLWGQR, MLWMQR, MLWORK, & + MLWQR, OLWDMD, OLWGQR, OLWMQR, & + OLWORK, OLWQR + LOGICAL :: LQUERY, SCCOLX, SCCOLY, WANTQ, & + WNTTRF, WNTRES, WNTVEC, WNTVCF, & + WNTVCQ, WNTREF, WNTEX + CHARACTER(LEN=1) :: JOBVL +! +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + LOGICAL LSAME + EXTERNAL LSAME +! +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL ZGEQRF, ZLACPY, ZLASET, ZUNGQR, & + ZUNMQR, XERBLA + +! External subroutines +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL ZGEDMD + +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC MAX, MIN, INT + !.......................................................... + ! + ! Test the input arguments + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME( JOBS, 'C' ) + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTVCF = LSAME(JOBZ,'F') + WNTVCQ = LSAME(JOBZ,'Q') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + WANTQ = LSAME(JOBQ,'Q') + WNTTRF = LSAME(JOBT,'R') + MINMN = MIN(M,N) + INFO = 0 + LQUERY = ( (LZWORK == -1) .OR. (LWORK == -1) .OR. (LIWORK == -1) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. & + LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. WNTVCF .OR. WNTVCQ & + .OR. LSAME(JOBZ,'N')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. LSAME(JOBZ,'N') ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WANTQ .OR. LSAME(JOBQ,'N')) ) THEN + INFO = -4 + ELSE IF ( .NOT. ( WNTTRF .OR. LSAME(JOBT,'N') ) ) THEN + INFO = -5 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -6 + ELSE IF ( .NOT. ((WHTSVD == 1).OR.(WHTSVD == 2).OR. & + (WHTSVD == 3).OR.(WHTSVD == 4)) ) THEN + INFO = -7 + ELSE IF ( M < 0 ) THEN + INFO = -8 + ELSE IF ( ( N < 0 ) .OR. ( N > M+1 ) ) THEN + INFO = -9 + ELSE IF ( LDF < M ) THEN + INFO = -11 + ELSE IF ( LDX < MINMN ) THEN + INFO = -13 + ELSE IF ( LDY < MINMN ) THEN + INFO = -15 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -16 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -17 + ELSE IF ( LDZ < M ) THEN + INFO = -21 + ELSE IF ( (WNTREF.OR.WNTEX ).AND.( LDB < MINMN ) ) THEN + INFO = -24 + ELSE IF ( LDV < N-1 ) THEN + INFO = -26 + ELSE IF ( LDS < N-1 ) THEN + INFO = -28 + END IF +! + IF ( WNTVEC .OR. WNTVCF .OR. WNTVCQ ) THEN + JOBVL = 'V' + ELSE + JOBVL = 'N' + END IF + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( ( N == 0 ) .OR. ( N == 1 ) ) THEN + ! All output except K is void. INFO=1 signals + ! the void input. In case of a workspace query, + ! the minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + ZWORK(1) = 2 + ZWORK(2) = 2 + WORK(1) = 2 + WORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + + MLRWRK = 2 + MLWORK = 2 + OLWORK = 2 + IMINWR = 1 + MLWQR = MAX(1,N) ! Minimal workspace length for ZGEQRF. + MLWORK = MAX(MLWORK,MINMN + MLWQR) + + IF ( LQUERY ) THEN + CALL ZGEQRF( M, N, F, LDF, ZWORK, ZWORK, -1, & + INFO1 ) + OLWQR = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,MINMN + OLWQR) + END IF + CALL ZGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,& + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + EIGS, Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, ZWORK, -1, WORK, -1, IWORK,& + -1, INFO1 ) + MLWDMD = INT(ZWORK(1)) + MLWORK = MAX(MLWORK, MINMN + MLWDMD) + MLRWRK = MAX(MLRWRK, INT(WORK(1))) + IMINWR = MAX(IMINWR, IWORK(1)) + IF ( LQUERY ) THEN + OLWDMD = INT(ZWORK(2)) + OLWORK = MAX(OLWORK, MINMN+OLWDMD) + END IF + IF ( WNTVEC .OR. WNTVCF ) THEN + MLWMQR = MAX(1,N) + MLWORK = MAX(MLWORK,MINMN+MLWMQR) + IF ( LQUERY ) THEN + CALL ZUNMQR( 'L','N', M, N, MINMN, F, LDF, & + ZWORK, Z, LDZ, ZWORK, -1, INFO1 ) + OLWMQR = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,MINMN+OLWMQR) + END IF + END IF + IF ( WANTQ ) THEN + MLWGQR = MAX(1,N) + MLWORK = MAX(MLWORK,MINMN+MLWGQR) + IF ( LQUERY ) THEN + CALL ZUNGQR( M, MINMN, MINMN, F, LDF, ZWORK, & + ZWORK, -1, INFO1 ) + OLWGQR = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,MINMN+OLWGQR) + END IF + END IF + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -34 + IF ( LWORK < MLRWRK .AND. (.NOT.LQUERY) ) INFO = -32 + IF ( LZWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -30 + END IF + IF( INFO /= 0 ) THEN + CALL XERBLA( 'ZGEDMDQ', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + ZWORK(1) = MLWORK + ZWORK(2) = OLWORK + WORK(1) = MLRWRK + WORK(2) = MLRWRK + RETURN + END IF +!..... +! Initial QR factorization that is used to represent the +! snapshots as elements of lower dimensional subspace. +! For large scale computation with M >> N, at this place +! one can use an out of core QRF. +! + CALL ZGEQRF( M, N, F, LDF, ZWORK, & + ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) +! +! Define X and Y as the snapshots representations in the +! orthogonal basis computed in the QR factorization. +! X corresponds to the leading N-1 and Y to the trailing +! N-1 snapshots. + CALL ZLASET( 'L', MINMN, N-1, ZZERO, ZZERO, X, LDX ) + CALL ZLACPY( 'U', MINMN, N-1, F, LDF, X, LDX ) + CALL ZLACPY( 'A', MINMN, N-1, F(1,2), LDF, Y, LDY ) + IF ( M >= 3 ) THEN + CALL ZLASET( 'L', MINMN-2, N-2, ZZERO, ZZERO, & + Y(3,1), LDY ) + END IF +! +! Compute the DMD of the projected snapshot pairs (X,Y) + CALL ZGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN, & + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + EIGS, Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, ZWORK(MINMN+1), LZWORK-MINMN, & + WORK, LWORK, IWORK, LIWORK, INFO1 ) + IF ( INFO1 == 2 .OR. INFO1 == 3 ) THEN + ! Return with error code. See ZGEDMD for details. + INFO = INFO1 + RETURN + ELSE + INFO = INFO1 + END IF +! +! The Ritz vectors (Koopman modes) can be explicitly +! formed or returned in factored form. + IF ( WNTVEC ) THEN + ! Compute the eigenvectors explicitly. + IF ( M > MINMN ) CALL ZLASET( 'A', M-MINMN, K, ZZERO, & + ZZERO, Z(MINMN+1,1), LDZ ) + CALL ZUNMQR( 'L','N', M, K, MINMN, F, LDF, ZWORK, Z, & + LDZ, ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) + ELSE IF ( WNTVCF ) THEN + ! Return the Ritz vectors (eigenvectors) in factored + ! form Z*V, where Z contains orthonormal matrix (the + ! product of Q from the initial QR factorization and + ! the SVD/POD_basis returned by ZGEDMD in X) and the + ! second factor (the eigenvectors of the Rayleigh + ! quotient) is in the array V, as returned by ZGEDMD. + CALL ZLACPY( 'A', N, K, X, LDX, Z, LDZ ) + IF ( M > N ) CALL ZLASET( 'A', M-N, K, ZZERO, ZZERO, & + Z(N+1,1), LDZ ) + CALL ZUNMQR( 'L','N', M, K, MINMN, F, LDF, ZWORK, Z, & + LDZ, ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) + END IF +! +! Some optional output variables: +! +! The upper triangular factor R in the initial QR +! factorization is optionally returned in the array Y. +! This is useful if this call to ZGEDMDQ is to be +! followed by a streaming DMD that is implemented in a +! QR compressed form. + IF ( WNTTRF ) THEN ! Return the upper triangular R in Y + CALL ZLASET( 'A', MINMN, N, ZZERO, ZZERO, Y, LDY ) + CALL ZLACPY( 'U', MINMN, N, F, LDF, Y, LDY ) + END IF +! +! The orthonormal/unitary factor Q in the initial QR +! factorization is optionally returned in the array F. +! Same as with the triangular factor above, this is +! useful in a streaming DMD. + IF ( WANTQ ) THEN ! Q overwrites F + CALL ZUNGQR( M, MINMN, MINMN, F, LDF, ZWORK, & + ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) + END IF +! + RETURN +! + END SUBROUTINE ZGEDMDQ + \ No newline at end of file diff --git a/lapack-netlib/SRC/zgehrd.f b/lapack-netlib/SRC/zgehrd.f index e18493cf9..0f4424ded 100644 --- a/lapack-netlib/SRC/zgehrd.f +++ b/lapack-netlib/SRC/zgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gehrd * *> \par Further Details: * ===================== @@ -173,7 +173,7 @@ INTEGER IHI, ILO, INFO, LDA, LWORK, N * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) + COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) * .. * * ===================================================================== @@ -182,7 +182,7 @@ INTEGER NBMAX, LDT, TSIZE PARAMETER ( NBMAX = 64, LDT = NBMAX+1, $ TSIZE = LDT*NBMAX ) - COMPLEX*16 ZERO, ONE + COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ) ) * .. @@ -190,7 +190,7 @@ LOGICAL LQUERY INTEGER I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB, $ NBMIN, NH, NX - COMPLEX*16 EI + COMPLEX*16 EI * .. * .. External Subroutines .. EXTERNAL ZAXPY, ZGEHD2, ZGEMM, ZLAHR2, ZLARFB, ZTRMM, @@ -221,12 +221,18 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'ZGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'ZGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + END IF WORK( 1 ) = LWKOPT ENDIF * @@ -248,7 +254,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -268,7 +273,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of diff --git a/lapack-netlib/SRC/zgejsv.f b/lapack-netlib/SRC/zgejsv.f index 5134ea891..b4bc531ab 100644 --- a/lapack-netlib/SRC/zgejsv.f +++ b/lapack-netlib/SRC/zgejsv.f @@ -52,10 +52,10 @@ *> are computed and stored in the arrays U and V, respectively. The diagonal *> of [SIGMA] is computed and stored in the array SVA. *> \endverbatim -*> -*> Arguments: -*> ========== -*> +* +* Arguments: +* ========== +* *> \param[in] JOBA *> \verbatim *> JOBA is CHARACTER*1 @@ -151,7 +151,7 @@ *> transposed A if A^* seems to be better with respect to convergence. *> If the matrix is not square, JOBT is ignored. *> The decision is based on two values of entropy over the adjoint -*> orbit of A^* * A. See the descriptions of WORK(6) and WORK(7). +*> orbit of A^* * A. See the descriptions of RWORK(6) and RWORK(7). *> = 'T': transpose if entropy test indicates possibly faster *> convergence of Jacobi process if A^* is taken as input. If A is *> replaced with A^*, then the row pivoting is included automatically. @@ -209,11 +209,11 @@ *> \verbatim *> SVA is DOUBLE PRECISION array, dimension (N) *> On exit, -*> - For WORK(1)/WORK(2) = ONE: The singular values of A. During the -*> computation SVA contains Euclidean column norms of the +*> - For RWORK(1)/RWORK(2) = ONE: The singular values of A. During +*> the computation SVA contains Euclidean column norms of the *> iterated matrices in the array A. -*> - For WORK(1) .NE. WORK(2): The singular values of A are -*> (WORK(1)/WORK(2)) * SVA(1:N). This factored form is used if +*> - For RWORK(1) .NE. RWORK(2): The singular values of A are +*> (RWORK(1)/RWORK(2)) * SVA(1:N). This factored form is used if *> sigma_max(A) overflows or if small singular values have been *> saved from underflow by scaling the input matrix A. *> - If JOBR='R' then some of the singular values may be returned @@ -252,7 +252,7 @@ *> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of *> the right singular vectors; *> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), -*> then V is used as workspace if the pprocedure +*> then V is used as workspace if the procedure *> replaces A with A^*. In that case, [U] is computed *> in V as right singular vectors of A^* and then *> copied back to the U array. This 'W' option is just @@ -1821,7 +1821,7 @@ IF ( CONDR2 .GE. COND_OK ) THEN * .. save the Householder vectors used for Q3 * (this overwrites the copy of R2, as it will not be -* needed in this branch, but it does not overwritte the +* needed in this branch, but it does not overwrite the * Huseholder vectors of Q2.). CALL ZLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N ) * .. and the rest of the information on Q3 is in @@ -1844,7 +1844,7 @@ END IF * * Second preconditioning finished; continue with Jacobi SVD -* The input matrix is lower trinagular. +* The input matrix is lower triangular. * * Recover the right singular vectors as solution of a well * conditioned triangular matrix equation. @@ -1888,7 +1888,7 @@ ELSE IF ( CONDR2 .LT. COND_OK ) THEN * * The matrix R2 is inverted. The solution of the matrix equation -* is Q3^* * V3 = the product of the Jacobi rotations (appplied to +* is Q3^* * V3 = the product of the Jacobi rotations (applied to * the lower triangular L3 from the LQ factorization of * R2=L3*Q3), pre-multiplied with the transposed Q3. CALL ZGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U, diff --git a/lapack-netlib/SRC/zgelq.f b/lapack-netlib/SRC/zgelq.f index de7c9a378..86610e801 100644 --- a/lapack-netlib/SRC/zgelq.f +++ b/lapack-netlib/SRC/zgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -166,6 +166,8 @@ *> the LQ factorization. *> \endverbatim *> +*> \ingroup gelq +*> * ===================================================================== SUBROUTINE ZGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) diff --git a/lapack-netlib/SRC/zgelqf.f b/lapack-netlib/SRC/zgelqf.f index 6c295eece..e988ea818 100644 --- a/lapack-netlib/SRC/zgelqf.f +++ b/lapack-netlib/SRC/zgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -118,7 +119,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gelqf * *> \par Further Details: * ===================== @@ -174,9 +175,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'ZGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -184,19 +184,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = LWKOPT RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN diff --git a/lapack-netlib/SRC/zgelqt3.f b/lapack-netlib/SRC/zgelqt3.f index 629a09472..1a71dc44e 100644 --- a/lapack-netlib/SRC/zgelqt3.f +++ b/lapack-netlib/SRC/zgelqt3.f @@ -174,7 +174,8 @@ * * Compute Householder transform when M=1 * - CALL ZLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T ) + CALL ZLARFG( N, A( 1, 1 ), A( 1, MIN( 2, N ) ), LDA, + & T( 1, 1 ) ) T(1,1)=CONJG(T(1,1)) * ELSE diff --git a/lapack-netlib/SRC/zgelsd.f b/lapack-netlib/SRC/zgelsd.f index 01793e16c..b5bc768e8 100644 --- a/lapack-netlib/SRC/zgelsd.f +++ b/lapack-netlib/SRC/zgelsd.f @@ -60,12 +60,6 @@ *> singular values which are less than RCOND times the largest singular *> value. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zgelss.f b/lapack-netlib/SRC/zgelss.f index be53ba95b..afdbaecf0 100644 --- a/lapack-netlib/SRC/zgelss.f +++ b/lapack-netlib/SRC/zgelss.f @@ -170,7 +170,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEsolve +*> \ingroup gelss * * ===================================================================== SUBROUTINE ZGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, @@ -212,10 +212,9 @@ COMPLEX*16 DUM( 1 ) * .. * .. External Subroutines .. - EXTERNAL DLABAD, DLASCL, DLASET, XERBLA, ZBDSQR, ZCOPY, - $ ZDRSCL, ZGEBRD, ZGELQF, ZGEMM, ZGEMV, ZGEQRF, - $ ZLACPY, ZLASCL, ZLASET, ZUNGBR, ZUNMBR, ZUNMLQ, - $ ZUNMQR + EXTERNAL DLASCL, DLASET, XERBLA, ZBDSQR, ZCOPY, ZDRSCL, + $ ZGEBRD, ZGELQF, ZGEMM, ZGEMV, ZGEQRF, ZLACPY, + $ ZLASCL, ZLASET, ZUNGBR, ZUNMBR, ZUNMLQ * .. * .. External Functions .. INTEGER ILAENV @@ -388,7 +387,6 @@ SFMIN = DLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A if max element outside range [SMLNUM,BIGNUM] * @@ -540,7 +538,7 @@ $ LDB, CZERO, WORK, N ) CALL ZLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB ) 20 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL ZGEMV( 'C', N, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 ) CALL ZCOPY( N, WORK, 1, B, 1 ) END IF @@ -645,7 +643,7 @@ CALL ZLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ), $ LDB ) 40 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL ZGEMV( 'C', M, M, CONE, WORK( IL ), LDWORK, B( 1, 1 ), $ 1, CZERO, WORK( IWORK ), 1 ) CALL ZCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 ) @@ -737,7 +735,7 @@ $ LDB, CZERO, WORK, N ) CALL ZLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB ) 60 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL ZGEMV( 'C', M, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 ) CALL ZCOPY( N, WORK, 1, B, 1 ) END IF diff --git a/lapack-netlib/SRC/zgelsy.f b/lapack-netlib/SRC/zgelsy.f index 65fa87ae9..497becf8b 100644 --- a/lapack-netlib/SRC/zgelsy.f +++ b/lapack-netlib/SRC/zgelsy.f @@ -116,6 +116,7 @@ *> B is COMPLEX*16 array, dimension (LDB,NRHS) *> On entry, the M-by-NRHS right hand side matrix B. *> On exit, the N-by-NRHS solution matrix X. +*> If M = 0 or N = 0, B is not referenced. *> \endverbatim *> *> \param[in] LDB @@ -148,6 +149,7 @@ *> The effective rank of A, i.e., the order of the submatrix *> R11. This is the same as the order of the submatrix T11 *> in the complete orthogonal factorization of A. +*> If NRHS = 0, RANK = 0 on output. *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/zgemlq.f b/lapack-netlib/SRC/zgemlq.f index 41cd1c059..11489087a 100644 --- a/lapack-netlib/SRC/zgemlq.f +++ b/lapack-netlib/SRC/zgemlq.f @@ -109,16 +109,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -142,7 +143,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -158,11 +159,13 @@ *> block sizes MB and NB returned by ILAENV, ZGELQ will use either *> ZLASWLQ (if the matrix is wide-and-short) or ZGELQT to compute *> the LQ factorization. -*> This version of ZGEMLQ will use either ZLAMSWLQ or ZGEMLQT to +*> This version of ZGEMLQ will use either ZLAMSWLQ or ZGEMLQT to *> multiply matrix Q by another matrix. *> Further Details in ZLAMSWLQ or ZGEMLQT. *> \endverbatim *> +*> \ingroup gemlq +*> * ===================================================================== SUBROUTINE ZGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -184,7 +187,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -200,7 +203,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -215,6 +218,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -243,7 +253,7 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( ( LWORK.LT.LWMIN ) .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 END IF * @@ -260,7 +270,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/zgemqr.f b/lapack-netlib/SRC/zgemqr.f index c83eaff2f..d14d74fe2 100644 --- a/lapack-netlib/SRC/zgemqr.f +++ b/lapack-netlib/SRC/zgemqr.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -166,6 +167,8 @@ *> *> \endverbatim *> +*> \ingroup gemqr +*> * ===================================================================== SUBROUTINE ZGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -187,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -203,7 +206,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -218,6 +221,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -246,12 +256,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN @@ -263,7 +273,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -276,7 +286,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = LWMIN * RETURN * diff --git a/lapack-netlib/SRC/zgeqlf.f b/lapack-netlib/SRC/zgeqlf.f index 94721540c..a27612c64 100644 --- a/lapack-netlib/SRC/zgeqlf.f +++ b/lapack-netlib/SRC/zgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -113,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup geqlf * *> \par Further Details: * ===================== @@ -188,8 +189,9 @@ END IF WORK( 1 ) = LWKOPT * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * diff --git a/lapack-netlib/SRC/zgeqp3rk.c b/lapack-netlib/SRC/zgeqp3rk.c new file mode 100644 index 000000000..0c8b41c2d --- /dev/null +++ b/lapack-netlib/SRC/zgeqp3rk.c @@ -0,0 +1,1074 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + +/* Check if the matrix A contains +Inf or -Inf, set INFO parameter */ +/* to the column number, where the first +/-Inf is found plus N, */ +/* and continue the computation. */ + + *info = *n + kp1; + + } + +/* ================================================================== */ + +/* Quick return if possible for the case when the first */ +/* stopping criterion is satisfied, i.e. KMAX = 0. */ + + if (*kmax == 0) { + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.; + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + z__1.r = (doublereal) lwkopt, z__1.i = 0.; + work[1].r = z__1.r, work[1].i = z__1.i; + return 0; + } + +/* ================================================================== */ + + eps = dlamch_("Epsilon"); + +/* Adjust ABSTOL */ + + if (*abstol >= 0.) { + safmin = dlamch_("Safe minimum"); +/* Computing MAX */ + d__1 = *abstol, d__2 = safmin * 2.; + *abstol = f2cmax(d__1,d__2); + } + +/* Adjust RELTOL */ + + if (*reltol >= 0.) { + *reltol = f2cmax(*reltol,eps); + } + +/* =================================================================== */ + +/* JMAX is the maximum index of the column to be factorized, */ +/* which is also limited by the first stopping criterion KMAX. */ + + jmax = f2cmin(*kmax,minmn); + +/* =================================================================== */ + +/* Quick return if possible for the case when the second or third */ +/* stopping criterion for the whole original matrix is satified, */ +/* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */ +/* (which is ONE <= RELTOL). */ + + if (maxc2nrm <= *abstol || 1. <= *reltol) { + + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.; + + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + + z__1.r = (doublereal) lwkopt, z__1.i = 0.; + work[1].r = z__1.r, work[1].i = z__1.i; + return 0; + } + +/* ================================================================== */ +/* Factorize columns */ +/* ================================================================== */ + +/* Determine the block size. */ + + nbmin = 2; + nx = 0; + + if (nb > 1 && nb < minmn) { + +/* Determine when to cross over from blocked to unblocked code. */ +/* (for N less than NX, unblocked code should be used). */ + +/* Computing MAX */ + i__1 = 0, i__2 = ilaenv_(&c__3, "ZGEQP3RK", " ", m, n, &c_n1, &c_n1, ( + ftnlen)8, (ftnlen)1); + nx = f2cmax(i__1,i__2); + + if (nx < minmn) { + +/* Determine if workspace is large enough for blocked code. */ + + if (*lwork < lwkopt) { + +/* Not enough workspace to use optimal block size that */ +/* is currently stored in NB. */ +/* Reduce NB and determine the minimum value of NB. */ + + nb = (*lwork - (*n << 1)) / (*n + 1); +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "ZGEQP3RK", " ", m, n, &c_n1, + &c_n1, (ftnlen)8, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + } + } + } + +/* ================================================================== */ + +/* DONE is the boolean flag to rerpresent the case when the */ +/* factorization completed in the block factorization routine, */ +/* before the end of the block. */ + + done = FALSE_; + +/* J is the column index. */ + + j = 1; + +/* (1) Use blocked code initially. */ + +/* JMAXB is the maximum column index of the block, when the */ +/* blocked code is used, is also limited by the first stopping */ +/* criterion KMAX. */ + +/* Computing MIN */ + i__1 = *kmax, i__2 = minmn - nx; + jmaxb = f2cmin(i__1,i__2); + + if (nb >= nbmin && nb < jmax && jmaxb > 0) { + +/* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */ +/* J is the column index of a column block; */ +/* JB is the column block size to pass to block factorization */ +/* routine in a loop step; */ +/* JBF is the number of columns that were actually factorized */ +/* that was returned by the block factorization routine */ +/* in a loop step, JBF <= JB; */ +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + while(j <= jmaxb) { + +/* Computing MIN */ + i__1 = nb, i__2 = jmaxb - j + 1; + jb = f2cmin(i__1,i__2); + n_sub__ = *n - j + 1; + ioffset = j - 1; + +/* Factorize JB columns among the columns A(J:N). */ + + i__1 = *n + *nrhs - j + 1; + zlaqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, + &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, + maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], & + rwork[*n + j], &work[1], &work[jb + 1], &i__1, &iwork[1], + &iinfo); + +/* Set INFO on the first occurence of Inf. */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } + + if (done) { + +/* Either the submatrix is zero before the end of the */ +/* column block, or ABSTOL or RELTOL criterion is */ +/* satisfied before the end of the column block, we can */ +/* return from the routine. Perform the following before */ +/* returning: */ +/* a) Set the number of factorized columns K, */ +/* K = IOFFSET + JBF from the last call of blocked */ +/* routine. */ +/* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */ +/* by the block factorization routine; */ +/* 2) The remaining TAUs are set to ZERO by the */ +/* block factorization routine. */ + + *k = ioffset + jbf; + +/* Set INFO on the first occurrence of NaN, NaN takes */ +/* prcedence over Inf. */ + + if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + +/* Return from the routine. */ + + z__1.r = (doublereal) lwkopt, z__1.i = 0.; + work[1].r = z__1.r, work[1].i = z__1.i; + + return 0; + + } + + j += jbf; + + } + + } + +/* Use unblocked code to factor the last or only block. */ +/* J = JMAX+1 means we factorized the maximum possible number of */ +/* columns, that is in ELSE clause we need to compute */ +/* the MAXC2NORM and RELMAXC2NORM to return after we processed */ +/* the blocks. */ + + if (j <= jmax) { + +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + n_sub__ = *n - j + 1; + ioffset = j - 1; + + i__1 = jmax - j + 1; + zlaqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, & + maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, + relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], &rwork[*n + j], & + work[1], &iinfo); + +/* ABSTOL or RELTOL criterion is satisfied when the number of */ +/* the factorized columns KF is smaller then the number */ +/* of columns JMAX-J+1 supplied to be factorized by the */ +/* unblocked routine, we can return from */ +/* the routine. Perform the following before returning: */ +/* a) Set the number of factorized columns K, */ +/* b) MAXC2NRMK and RELMAXC2NRMK are returned by the */ +/* unblocked factorization routine above. */ + + *k = j - 1 + kf; + +/* Set INFO on the first exception occurence. */ + +/* Set INFO on the first exception occurence of Inf or NaN, */ +/* (NaN takes precedence over Inf). */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } else if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + + } else { + +/* Compute the return values for blocked code. */ + +/* Set the number of factorized columns if the unblocked routine */ +/* was not called. */ + + *k = jmax; + +/* If there exits a residual matrix after the blocked code: */ +/* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */ +/* residual matrix, otherwise set them to ZERO; */ +/* 2) Set TAU(K+1:MINMN) to ZERO. */ + + if (*k < minmn) { + i__1 = *n - *k; + jmaxc2nrm = *k + idamax_(&i__1, &rwork[*k + 1], &c__1); + *maxc2nrmk = rwork[jmaxc2nrm]; + if (*k == 0) { + *relmaxc2nrmk = 1.; + } else { + *relmaxc2nrmk = *maxc2nrmk / maxc2nrm; + } + + i__1 = minmn; + for (j = *k + 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + + } else { + *maxc2nrmk = 0.; + *relmaxc2nrmk = 0.; + + } + +/* END IF( J.LE.JMAX ) THEN */ + + } + + z__1.r = (doublereal) lwkopt, z__1.i = 0.; + work[1].r = z__1.r, work[1].i = z__1.i; + + return 0; + +/* End of ZGEQP3RK */ + +} /* zgeqp3rk_ */ + diff --git a/lapack-netlib/SRC/zgeqp3rk.f b/lapack-netlib/SRC/zgeqp3rk.f new file mode 100644 index 000000000..01dcce0de --- /dev/null +++ b/lapack-netlib/SRC/zgeqp3rk.f @@ -0,0 +1,1092 @@ +*> \brief \b ZGEQP3RK computes a truncated Householder QR factorization with column pivoting of a complex m-by-n matrix A by using Level 3 BLAS and overwrites m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZGEQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, +* $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ WORK, LWORK, RWORK, IWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, K, KMAX, LDA, LWORK, M, N, NRHS +* DOUBLE PRECISION ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* DOUBLE PRECISION RWORK( * ) +* COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGEQP3RK performs two tasks simultaneously: +*> +*> Task 1: The routine computes a truncated (rank K) or full rank +*> Householder QR factorization with column pivoting of a complex +*> M-by-N matrix A using Level 3 BLAS. K is the number of columns +*> that were factorized, i.e. factorization rank of the +*> factor R, K <= min(M,N). +*> +*> A * P(K) = Q(K) * R(K) = +*> +*> = Q(K) * ( R11(K) R12(K) ) = Q(K) * ( R(K)_approx ) +*> ( 0 R22(K) ) ( 0 R(K)_residual ), +*> +*> where: +*> +*> P(K) is an N-by-N permutation matrix; +*> Q(K) is an M-by-M unitary matrix; +*> R(K)_approx = ( R11(K), R12(K) ) is a rank K approximation of the +*> full rank factor R with K-by-K upper-triangular +*> R11(K) and K-by-N rectangular R12(K). The diagonal +*> entries of R11(K) appear in non-increasing order +*> of absolute value, and absolute values of all of +*> them exceed the maximum column 2-norm of R22(K) +*> up to roundoff error. +*> R(K)_residual = R22(K) is the residual of a rank K approximation +*> of the full rank factor R. It is a +*> an (M-K)-by-(N-K) rectangular matrix; +*> 0 is a an (M-K)-by-K zero matrix. +*> +*> Task 2: At the same time, the routine overwrites a complex M-by-NRHS +*> matrix B with Q(K)**H * B using Level 3 BLAS. +*> +*> ===================================================================== +*> +*> The matrices A and B are stored on input in the array A as +*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS) +*> respectively. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> The truncation criteria (i.e. when to stop the factorization) +*> can be any of the following: +*> +*> 1) The input parameter KMAX, the maximum number of columns +*> KMAX to factorize, i.e. the factorization rank is limited +*> to KMAX. If KMAX >= min(M,N), the criterion is not used. +*> +*> 2) The input parameter ABSTOL, the absolute tolerance for +*> the maximum column 2-norm of the residual matrix R22(K). This +*> means that the factorization stops if this norm is less or +*> equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used. +*> +*> 3) The input parameter RELTOL, the tolerance for the maximum +*> column 2-norm matrix of the residual matrix R22(K) divided +*> by the maximum column 2-norm of the original matrix A, which +*> is equal to abs(R(1,1)). This means that the factorization stops +*> when the ratio of the maximum column 2-norm of R22(K) to +*> the maximum column 2-norm of A is less than or equal to RELTOL. +*> If RELTOL < 0.0, the criterion is not used. +*> +*> 4) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix R22(K) is a zero matrix in some +*> factorization step K. ( This stopping criterion is implicit. ) +*> +*> The algorithm stops when any of these conditions is first +*> satisfied, otherwise the whole matrix A is factorized. +*> +*> To factorize the whole matrix A, use the values +*> KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0. +*> +*> The routine returns: +*> a) Q(K), R(K)_approx = ( R11(K), R12(K) ), +*> R(K)_residual = R22(K), P(K), i.e. the resulting matrices +*> of the factorization; P(K) is represented by JPIV, +*> ( if K = min(M,N), R(K)_approx is the full factor R, +*> and there is no residual matrix R(K)_residual); +*> b) K, the number of columns that were factorized, +*> i.e. factorization rank; +*> c) MAXC2NRMK, the maximum column 2-norm of the residual +*> matrix R(K)_residual = R22(K), +*> ( if K = min(M,N), MAXC2NRMK = 0.0 ); +*> d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum +*> column 2-norm of the original matrix A, which is equal +*> to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 ); +*> e) Q(K)**H * B, the matrix B with the unitary +*> transformation Q(K)**H applied on the left. +*> +*> The N-by-N permutation matrix P(K) is stored in a compact form in +*> the integer array JPIV. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The M-by-M unitary matrix Q is represented as a product +*> of elementary Householder reflectors +*> +*> Q(K) = H(1) * H(2) * . . . * H(K), +*> +*> where K is the number of columns that were factorized. +*> +*> Each H(j) has the form +*> +*> H(j) = I - tau * v * v**H, +*> +*> where 1 <= j <= K and +*> I is an M-by-M identity matrix, +*> tau is a complex scalar, +*> v is a complex vector with v(1:j-1) = 0 and v(j) = 1. +*> +*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j). +*> +*> See the Further Details section for more information. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e. the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M,N), then this stopping criterion +*> is not used, the routine factorizes columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B are not modified, and +*> the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION +*> +*> The second factorization stopping criterion, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix R22(K). +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix R22(K) +*> is less than or equal to ABSTOL. Let SAFMIN = DLAMCH('S'). +*> +*> a) If ABSTOL is NaN, then no computation is performed +*> and an error message ( INFO = -5 ) is issued +*> by XERBLA. +*> +*> b) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN +*> is used. This includes the case ABSTOL = -0.0. +*> +*> d) If 2*SAFMIN <= ABSTOL then the input value +*> of ABSTOL is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If ABSTOL chosen above is >= MAXC2NRM, then this +*> stopping criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. The routine +*> returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case ABSTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION +*> +*> The third factorization stopping criterion, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio +*> abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of +*> the residual matrix R22(K) to the maximum column 2-norm of +*> the original matrix A. The algorithm converges (stops the +*> factorization), when abs(R(K+1,K+1))/abs(R(1,1)) A is less +*> than or equal to RELTOL. Let EPS = DLAMCH('E'). +*> +*> a) If RELTOL is NaN, then no computation is performed +*> and an error message ( INFO = -6 ) is issued +*> by XERBLA. +*> +*> b) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used. +*> This includes the case RELTOL = -0.0. +*> +*> d) If EPS <= RELTOL then the input value of RELTOL +*> is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If RELTOL chosen above is >= 1.0, then this stopping +*> criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. +*> The routine returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case RELTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> +*> NOTE: We recommend that RELTOL satisfy +*> min( 10*max(M,N)*EPS, sqrt(EPS) ) <= RELTOL +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N+NRHS) +*> +*> On entry: +*> +*> a) The subarray A(1:M,1:N) contains the M-by-N matrix A. +*> b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS +*> matrix B. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> +*> a) The subarray A(1:M,1:N) contains parts of the factors +*> of the matrix A: +*> +*> 1) If K = 0, A(1:M,1:N) contains the original matrix A. +*> 2) If K > 0, A(1:M,1:N) contains parts of the +*> factors: +*> +*> 1. The elements below the diagonal of the subarray +*> A(1:M,1:K) together with TAU(1:K) represent the +*> unitary matrix Q(K) as a product of K Householder +*> elementary reflectors. +*> +*> 2. The elements on and above the diagonal of +*> the subarray A(1:K,1:N) contain K-by-N +*> upper-trapezoidal matrix +*> R(K)_approx = ( R11(K), R12(K) ). +*> NOTE: If K=min(M,N), i.e. full rank factorization, +*> then R_approx(K) is the full factor R which +*> is upper-trapezoidal. If, in addition, M>=N, +*> then R is upper-triangular. +*> +*> 3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K) +*> rectangular matrix R(K)_residual = R22(K). +*> +*> b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains +*> the M-by-NRHS product Q(K)**H * B. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> This is the leading dimension for both matrices, A and B. +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> +*> NOTE: If K = 0, a) the arrays A and B are not modified; +*> b) the array TAU(1:min(M,N)) is set to ZERO, +*> if the matrix A does not contain NaN, +*> otherwise the elements TAU(1:min(M,N)) +*> are undefined; +*> c) the elements of the array JPIV are set +*> as follows: for j = 1:N, JPIV(j) = j. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix R22(K), +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then MAXC2NRMK equals the maximum column 2-norm +*> of the original matrix A. +*> +*> b) If 0 < K < min(M,N), then MAXC2NRMK is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then MAXC2NRMK = 0.0. +*> +*> NOTE: MAXC2NRMK in the factorization step K would equal +*> R(K+1,K+1) in the next factorization step K+1. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix R22(K) (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then RELMAXC2NRMK = 1.0. +*> +*> b) If 0 < K < min(M,N), then +*> RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then RELMAXC2NRMK = 0.0. +*> +*> NOTE: RELMAXC2NRMK in the factorization step K would equal +*> abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization +*> step K+1. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The elements of the array JPIV(1:N) are always set +*> by the routine, for example, even when no columns +*> were factorized, i.e. when K = 0, the elements are +*> set as JPIV(j) = j for j = 1:N. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX*16 array, dimension (min(M,N)) +*> The scalar factors of the elementary reflectors. +*> +*> If 0 < K <= min(M,N), only the elements TAU(1:K) of +*> the array TAU are modified by the factorization. +*> After the factorization computed, if no NaN was found +*> during the factorization, the remaining elements +*> TAU(K+1:min(M,N)) are set to zero, otherwise the +*> elements TAU(K+1:min(M,N)) are not set and therefore +*> undefined. +*> ( If K = 0, all elements of TAU are set to zero, if +*> the matrix A does not contain NaN. ) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= N+NRHS-1, otherwise. +*> For optimal performance LWORK >= NB*( N+NRHS+1 ), +*> where NB is the optimal block size for ZGEQP3RK returned +*> by ILAENV. Minimal block size MINNB=2. +*> +*> NOTE: The decision, whether to use unblocked BLAS 2 +*> or blocked BLAS 3 code is based not only on the dimension +*> LWORK of the availbale workspace WORK, but also also on the +*> matrix A dimension N via crossover point NX returned +*> by ILAENV. (For N less than NX, unblocked code should be +*> used.) +*> +*> If LWORK = -1, then a workspace query is assumed; +*> the routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, dimension (2*N) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix in the blocked step auxiliary subroutine ZLAQP3RK ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) INFO < 0: if INFO = -i, the i-th argument had an +*> illegal value. +*> 3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup geqp3rk +* +*> \par Further Details: +* ===================== +* +*> \verbatim +*> ZGEQP3RK is based on the same BLAS3 Householder QR factorization +*> algorithm with column pivoting as in ZGEQP3 routine which uses +*> ZLARFG routine to generate Householder reflectors +*> for QR factorization. +*> +*> We can also write: +*> +*> A = A_approx(K) + A_residual(K) +*> +*> The low rank approximation matrix A(K)_approx from +*> the truncated QR factorization of rank K of the matrix A is: +*> +*> A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T +*> ( 0 0 ) +*> +*> = Q(K) * ( R11(K) R12(K) ) * P(K)**T +*> ( 0 0 ) +*> +*> The residual A_residual(K) of the matrix A is: +*> +*> A_residual(K) = Q(K) * ( 0 0 ) * P(K)**T = +*> ( 0 R(K)_residual ) +*> +*> = Q(K) * ( 0 0 ) * P(K)**T +*> ( 0 R22(K) ) +*> +*> The truncated (rank K) factorization guarantees that +*> the maximum column 2-norm of A_residual(K) is less than +*> or equal to MAXC2NRMK up to roundoff error. +*> +*> NOTE: An approximation of the null vectors +*> of A can be easily computed from R11(K) +*> and R12(K): +*> +*> Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) ) +*> ( -I ) +*> +*> \endverbatim +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, + $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ WORK, LWORK, RWORK, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, K, KF, KMAX, LDA, LWORK, M, N, NRHS + DOUBLE PRECISION ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + DOUBLE PRECISION RWORK( * ) + COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER INB, INBMIN, IXOVER + PARAMETER ( INB = 1, INBMIN = 2, IXOVER = 3 ) + DOUBLE PRECISION ZERO, ONE, TWO + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 ) + COMPLEX*16 CZERO + PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, DONE + INTEGER IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX, + $ JMAXC2NRM, KP1, LWKOPT, MINMN, N_SUB, NB, + $ NBMIN, NX + DOUBLE PRECISION EPS, HUGEVAL, MAXC2NRM, SAFMIN +* .. +* .. External Subroutines .. + EXTERNAL ZLAQP2RK, ZLAQP3RK, XERBLA +* .. +* .. External Functions .. + LOGICAL DISNAN + INTEGER IDAMAX, ILAENV + DOUBLE PRECISION DLAMCH, DZNRM2 + EXTERNAL DISNAN, DLAMCH, DZNRM2, IDAMAX, ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test input arguments +* ==================== +* + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( KMAX.LT.0 ) THEN + INFO = -4 + ELSE IF( DISNAN( ABSTOL ) ) THEN + INFO = -5 + ELSE IF( DISNAN( RELTOL ) ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -8 + END IF +* +* If the input parameters M, N, NRHS, KMAX, LDA are valid: +* a) Test the input workspace size LWORK for the minimum +* size requirement IWS. +* b) Determine the optimal block size NB and optimal +* workspace size LWKOPT to be returned in WORK(1) +* in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., +* (3) when routine exits. +* Here, IWS is the miminum workspace required for unblocked +* code. +* + IF( INFO.EQ.0 ) THEN + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + IWS = 1 + LWKOPT = 1 + ELSE +* +* Minimal workspace size in case of using only unblocked +* BLAS 2 code in ZLAQP2RK. +* 1) ZLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in ZLARF subroutine inside ZLAQP2RK to apply an +* elementary reflector from the left. +* TOTAL_WORK_SIZE = 3*N + NRHS - 1 +* + IWS = N + NRHS - 1 +* +* Assign to NB optimal block size. +* + NB = ILAENV( INB, 'ZGEQP3RK', ' ', M, N, -1, -1 ) +* +* A formula for the optimal workspace size in case of using +* both unblocked BLAS 2 in ZLAQP2RK and blocked BLAS 3 code +* in ZLAQP3RK. +* 1) ZGEQP3RK, ZLAQP2RK, ZLAQP3RK: 2*N to store full and +* partial column 2-norms. +* 2) ZLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in ZLARF subroutine to apply an elementary reflector +* from the left. +* 3) ZLAQP3RK: NB*(N+NRHS) to use in the work array F that +* is used to apply a block reflector from +* the left. +* 4) ZLAQP3RK: NB to use in the auxilixary array AUX. +* Sizes (2) and ((3) + (4)) should intersect, therefore +* TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2. +* + LWKOPT = 2*N + NB*( N+NRHS+1 ) + END IF + WORK( 1 ) = DCMPLX( LWKOPT ) +* + IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN + INFO = -15 + END IF + END IF +* +* NOTE: The optimal workspace size is returned in WORK(1), if +* the input parameters M, N, NRHS, KMAX, LDA are valid. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGEQP3RK', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible for M=0 or N=0. +* + IF( MINMN.EQ.0 ) THEN + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + WORK( 1 ) = DCMPLX( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* +* Initialize column pivot array JPIV. +* + DO J = 1, N + JPIV( J ) = J + END DO +* +* ================================================================== +* +* Initialize storage for partial and exact column 2-norms. +* a) The elements WORK(1:N) are used to store partial column +* 2-norms of the matrix A, and may decrease in each computation +* step; initialize to the values of complete columns 2-norms. +* b) The elements WORK(N+1:2*N) are used to store complete column +* 2-norms of the matrix A, they are not changed during the +* computation; initialize the values of complete columns 2-norms. +* + DO J = 1, N + RWORK( J ) = DZNRM2( M, A( 1, J ), 1 ) + RWORK( N+J ) = RWORK( J ) + END DO +* +* ================================================================== +* +* Compute the pivot column index and the maximum column 2-norm +* for the whole original matrix stored in A(1:M,1:N). +* + KP1 = IDAMAX( N, RWORK( 1 ), 1 ) +* +* ==================================================================. +* + IF( DISNAN( MAXC2NRM ) ) THEN +* +* Check if the matrix A contains NaN, set INFO parameter +* to the column number where the first NaN is found and return +* from the routine. +* + K = 0 + INFO = KP1 +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = MAXC2NRM +* +* Array TAU is not set and contains undefined elements. +* + WORK( 1 ) = DCMPLX( LWKOPT ) + RETURN + END IF +* +* =================================================================== +* + IF( MAXC2NRM.EQ.ZERO ) THEN +* +* Check is the matrix A is a zero matrix, set array TAU and +* return from the routine. +* + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO +* + DO J = 1, MINMN + TAU( J ) = CZERO + END DO +* + WORK( 1 ) = DCMPLX( LWKOPT ) + RETURN +* + END IF +* +* =================================================================== +* + HUGEVAL = DLAMCH( 'Overflow' ) +* + IF( MAXC2NRM.GT.HUGEVAL ) THEN +* +* Check if the matrix A contains +Inf or -Inf, set INFO parameter +* to the column number, where the first +/-Inf is found plus N, +* and continue the computation. +* + INFO = N + KP1 +* + END IF +* +* ================================================================== +* +* Quick return if possible for the case when the first +* stopping criterion is satisfied, i.e. KMAX = 0. +* + IF( KMAX.EQ.0 ) THEN + K = 0 + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = ONE + DO J = 1, MINMN + TAU( J ) = CZERO + END DO + WORK( 1 ) = DCMPLX( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* + EPS = DLAMCH('Epsilon') +* +* Adjust ABSTOL +* + IF( ABSTOL.GE.ZERO ) THEN + SAFMIN = DLAMCH('Safe minimum') + ABSTOL = MAX( ABSTOL, TWO*SAFMIN ) + END IF +* +* Adjust RELTOL +* + IF( RELTOL.GE.ZERO ) THEN + RELTOL = MAX( RELTOL, EPS ) + END IF +* +* =================================================================== +* +* JMAX is the maximum index of the column to be factorized, +* which is also limited by the first stopping criterion KMAX. +* + JMAX = MIN( KMAX, MINMN ) +* +* =================================================================== +* +* Quick return if possible for the case when the second or third +* stopping criterion for the whole original matrix is satified, +* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL +* (which is ONE <= RELTOL). +* + IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN +* + K = 0 + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = ONE +* + DO J = 1, MINMN + TAU( J ) = CZERO + END DO +* + WORK( 1 ) = DCMPLX( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* Factorize columns +* ================================================================== +* +* Determine the block size. +* + NBMIN = 2 + NX = 0 +* + IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN +* +* Determine when to cross over from blocked to unblocked code. +* (for N less than NX, unblocked code should be used). +* + NX = MAX( 0, ILAENV( IXOVER, 'ZGEQP3RK', ' ', M, N, -1, -1 ) ) +* + IF( NX.LT.MINMN ) THEN +* +* Determine if workspace is large enough for blocked code. +* + IF( LWORK.LT.LWKOPT ) THEN +* +* Not enough workspace to use optimal block size that +* is currently stored in NB. +* Reduce NB and determine the minimum value of NB. +* + NB = ( LWORK-2*N ) / ( N+1 ) + NBMIN = MAX( 2, ILAENV( INBMIN, 'ZGEQP3RK', ' ', M, N, + $ -1, -1 ) ) +* + END IF + END IF + END IF +* +* ================================================================== +* +* DONE is the boolean flag to rerpresent the case when the +* factorization completed in the block factorization routine, +* before the end of the block. +* + DONE = .FALSE. +* +* J is the column index. +* + J = 1 +* +* (1) Use blocked code initially. +* +* JMAXB is the maximum column index of the block, when the +* blocked code is used, is also limited by the first stopping +* criterion KMAX. +* + JMAXB = MIN( KMAX, MINMN - NX ) +* + IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN +* +* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: +* J is the column index of a column block; +* JB is the column block size to pass to block factorization +* routine in a loop step; +* JBF is the number of columns that were actually factorized +* that was returned by the block factorization routine +* in a loop step, JBF <= JB; +* N_SUB is the number of columns in the submatrix; +* IOFFSET is the number of rows that should not be factorized. +* + DO WHILE( J.LE.JMAXB ) +* + JB = MIN( NB, JMAXB-J+1 ) + N_SUB = N-J+1 + IOFFSET = J-1 +* +* Factorize JB columns among the columns A(J:N). +* + CALL ZLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA, + $ DONE, JBF, MAXC2NRMK, RELMAXC2NRMK, + $ JPIV( J ), TAU( J ), + $ RWORK( J ), RWORK( N+J ), + $ WORK( 1 ), WORK( JB+1 ), + $ N+NRHS-J+1, IWORK, IINFO ) +* +* Set INFO on the first occurence of Inf. +* + IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN + INFO = 2*IOFFSET + IINFO + END IF +* + IF( DONE ) THEN +* +* Either the submatrix is zero before the end of the +* column block, or ABSTOL or RELTOL criterion is +* satisfied before the end of the column block, we can +* return from the routine. Perform the following before +* returning: +* a) Set the number of factorized columns K, +* K = IOFFSET + JBF from the last call of blocked +* routine. +* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned +* by the block factorization routine; +* 2) The remaining TAUs are set to ZERO by the +* block factorization routine. +* + K = IOFFSET + JBF +* +* Set INFO on the first occurrence of NaN, NaN takes +* prcedence over Inf. +* + IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN + INFO = IOFFSET + IINFO + END IF +* +* Return from the routine. +* + WORK( 1 ) = DCMPLX( LWKOPT ) +* + RETURN +* + END IF +* + J = J + JBF +* + END DO +* + END IF +* +* Use unblocked code to factor the last or only block. +* J = JMAX+1 means we factorized the maximum possible number of +* columns, that is in ELSE clause we need to compute +* the MAXC2NORM and RELMAXC2NORM to return after we processed +* the blocks. +* + IF( J.LE.JMAX ) THEN +* +* N_SUB is the number of columns in the submatrix; +* IOFFSET is the number of rows that should not be factorized. +* + N_SUB = N-J+1 + IOFFSET = J-1 +* + CALL ZLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1, + $ ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA, + $ KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ), + $ TAU( J ), RWORK( J ), RWORK( N+J ), + $ WORK( 1 ), IINFO ) +* +* ABSTOL or RELTOL criterion is satisfied when the number of +* the factorized columns KF is smaller then the number +* of columns JMAX-J+1 supplied to be factorized by the +* unblocked routine, we can return from +* the routine. Perform the following before returning: +* a) Set the number of factorized columns K, +* b) MAXC2NRMK and RELMAXC2NRMK are returned by the +* unblocked factorization routine above. +* + K = J - 1 + KF +* +* Set INFO on the first exception occurence. +* +* Set INFO on the first exception occurence of Inf or NaN, +* (NaN takes precedence over Inf). +* + IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN + INFO = 2*IOFFSET + IINFO + ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN + INFO = IOFFSET + IINFO + END IF +* + ELSE +* +* Compute the return values for blocked code. +* +* Set the number of factorized columns if the unblocked routine +* was not called. +* + K = JMAX +* +* If there exits a residual matrix after the blocked code: +* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the +* residual matrix, otherwise set them to ZERO; +* 2) Set TAU(K+1:MINMN) to ZERO. +* + IF( K.LT.MINMN ) THEN + JMAXC2NRM = K + IDAMAX( N-K, RWORK( K+1 ), 1 ) + MAXC2NRMK = RWORK( JMAXC2NRM ) + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + DO J = K + 1, MINMN + TAU( J ) = CZERO + END DO +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO +* + END IF +* +* END IF( J.LE.JMAX ) THEN +* + END IF +* + WORK( 1 ) = DCMPLX( LWKOPT ) +* + RETURN +* +* End of ZGEQP3RK +* + END diff --git a/lapack-netlib/SRC/zgeqr.f b/lapack-netlib/SRC/zgeqr.f index 20a80d083..7df9c2403 100644 --- a/lapack-netlib/SRC/zgeqr.f +++ b/lapack-netlib/SRC/zgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE ZGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME @@ -244,8 +246,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +257,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +272,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +286,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +313,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ * RETURN * diff --git a/lapack-netlib/SRC/zgeqrfp.f b/lapack-netlib/SRC/zgeqrfp.f index 73bcde667..3562de36e 100644 --- a/lapack-netlib/SRC/zgeqrfp.f +++ b/lapack-netlib/SRC/zgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -122,7 +123,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup geqrfp * *> \par Further Details: * ===================== @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. EXTERNAL XERBLA, ZGEQR2P, ZLARFB, ZLARFT @@ -181,8 +182,16 @@ * INFO = 0 NB = ILAENV( 1, 'ZGEQRF', ' ', M, N, -1, -1 ) - LWKOPT = N*NB + K = MIN( M, N ) + IF( K.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = N + LWKOPT = N*NB + END IF WORK( 1 ) = LWKOPT +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -190,7 +199,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN @@ -202,7 +211,6 @@ * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -210,7 +218,7 @@ * NBMIN = 2 NX = 0 - IWS = N + IWS = LWKMIN IF( NB.GT.1 .AND. NB.LT.K ) THEN * * Determine when to cross over from blocked to unblocked code. diff --git a/lapack-netlib/SRC/zgesdd.f b/lapack-netlib/SRC/zgesdd.f index 30d18a3a0..7f203afa5 100644 --- a/lapack-netlib/SRC/zgesdd.f +++ b/lapack-netlib/SRC/zgesdd.f @@ -53,12 +53,6 @@ *> *> Note that the routine returns VT = V**H, not V. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zgesvdq.f b/lapack-netlib/SRC/zgesvdq.f index 05eb722f8..b990f7389 100644 --- a/lapack-netlib/SRC/zgesvdq.f +++ b/lapack-netlib/SRC/zgesvdq.f @@ -363,7 +363,7 @@ *> an optimal implementation would do all necessary scaling before calling *> CGESVD and the scaling in CGESVD can be switched off. *> 3. Other comments related to code optimization are given in comments in the -*> code, enlosed in [[double brackets]]. +*> code, enclosed in [[double brackets]]. *> \endverbatim * *> \par Bugs, examples and comments diff --git a/lapack-netlib/SRC/zgesvj.f b/lapack-netlib/SRC/zgesvj.f index 6cd2335f2..2be45d826 100644 --- a/lapack-netlib/SRC/zgesvj.f +++ b/lapack-netlib/SRC/zgesvj.f @@ -200,23 +200,25 @@ *> \verbatim *> LDV is INTEGER *> The leading dimension of the array V, LDV >= 1. -*> If JOBV = 'V', then LDV >= max(1,N). -*> If JOBV = 'A', then LDV >= max(1,MV) . +*> If JOBV = 'V', then LDV >= MAX(1,N). +*> If JOBV = 'A', then LDV >= MAX(1,MV) . *> \endverbatim *> *> \param[in,out] CWORK *> \verbatim -*> CWORK is COMPLEX*16 array, dimension (max(1,LWORK)) +*> CWORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) *> Used as workspace. -*> If on entry LWORK = -1, then a workspace query is assumed and -*> no computation is done; CWORK(1) is set to the minial (and optimal) -*> length of CWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER. -*> Length of CWORK, LWORK >= M+N. +*> Length of CWORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M+N, otherwise. +*> +*> If on entry LWORK = -1, then a workspace query is assumed and +*> no computation is done; CWORK(1) is set to the minial (and optimal) +*> length of CWORK. *> \endverbatim *> *> \param[in,out] RWORK @@ -247,15 +249,17 @@ *> RWORK(6) = the largest absolute value over all sines of the *> Jacobi rotation angles in the last sweep. It can be *> useful for a post festum analysis. -*> If on entry LRWORK = -1, then a workspace query is assumed and -*> no computation is done; RWORK(1) is set to the minial (and optimal) -*> length of RWORK. *> \endverbatim *> *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> Length of RWORK, LRWORK >= MAX(6,N). +*> Length of RWORK. +*> LRWORK >= 1, if MIN(M,N) = 0, and LRWORK >= MAX(6,N), otherwise. +*> +*> If on entry LRWORK = -1, then a workspace query is assumed and +*> no computation is done; RWORK(1) is set to the minial (and optimal) +*> length of RWORK. *> \endverbatim *> *> \param[out] INFO @@ -276,7 +280,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gesvj * *> \par Further Details: * ===================== @@ -367,23 +371,25 @@ * * .. Local Parameters .. DOUBLE PRECISION ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0) - COMPLEX*16 CZERO, CONE - PARAMETER ( CZERO = (0.0D0, 0.0D0), CONE = (1.0D0, 0.0D0) ) - INTEGER NSWEEP - PARAMETER ( NSWEEP = 30 ) + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0) + COMPLEX*16 CZERO, CONE + PARAMETER ( CZERO = (0.0D0, 0.0D0), CONE = (1.0D0, 0.0D0) ) + INTEGER NSWEEP + PARAMETER ( NSWEEP = 30 ) * .. * .. Local Scalars .. - COMPLEX*16 AAPQ, OMPQ - DOUBLE PRECISION AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG, - $ BIGTHETA, CS, CTOL, EPSLN, MXAAPQ, - $ MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL, - $ SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL - INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, - $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, - $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + COMPLEX*16 AAPQ, OMPQ + DOUBLE PRECISION AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG, + $ BIGTHETA, CS, CTOL, EPSLN, MXAAPQ, + $ MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL, + $ SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, + $ TOL + INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, + $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, + $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, + $ SWBAND, MINMN, LWMIN, LRWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, + $ ROTOK, RSVEC, UCTOL, UPPER * .. * .. * .. Intrinsic Functions .. @@ -422,7 +428,16 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * - LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 ) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + LRWMIN = 1 + ELSE + LWMIN = M+N + LRWMIN = MAX( 6, N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -442,9 +457,9 @@ INFO = -11 ELSE IF( UCTOL .AND. ( RWORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( ( LWORK.LT.( M+N ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 - ELSE IF( ( LRWORK.LT.MAX( N, 6 ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LRWORK.LT.LRWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -15 ELSE INFO = 0 @@ -454,15 +469,15 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZGESVJ', -INFO ) RETURN - ELSE IF ( LQUERY ) THEN - CWORK(1) = M + N - RWORK(1) = MAX( N, 6 ) + ELSE IF( LQUERY ) THEN + CWORK( 1 ) = LWMIN + RWORK( 1 ) = LRWMIN RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/zgetf2.f b/lapack-netlib/SRC/zgetf2.f index c247f8645..7c63dbbee 100644 --- a/lapack-netlib/SRC/zgetf2.f +++ b/lapack-netlib/SRC/zgetf2.f @@ -101,7 +101,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup getf2 * * ===================================================================== SUBROUTINE ZGETF2( M, N, A, LDA, IPIV, INFO ) @@ -127,7 +127,7 @@ * .. * .. Local Scalars .. DOUBLE PRECISION SFMIN - INTEGER I, J, JP + INTEGER J, JP * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -135,7 +135,7 @@ EXTERNAL DLAMCH, IZAMAX * .. * .. External Subroutines .. - EXTERNAL XERBLA, ZGERU, ZSCAL, ZSWAP + EXTERNAL XERBLA, ZGERU, ZRSCL, ZSWAP * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -181,15 +181,8 @@ * * Compute elements J+1:M of J-th column. * - IF( J.LT.M ) THEN - IF( ABS(A( J, J )) .GE. SFMIN ) THEN - CALL ZSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) - ELSE - DO 20 I = 1, M-J - A( J+I, J ) = A( J+I, J ) / A( J, J ) - 20 CONTINUE - END IF - END IF + IF( J.LT.M ) + $ CALL ZRSCL( M-J, A( J, J ), A( J+1, J ), 1 ) * ELSE IF( INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/zgetri.f b/lapack-netlib/SRC/zgetri.f index 41782841c..f3806a77c 100644 --- a/lapack-netlib/SRC/zgetri.f +++ b/lapack-netlib/SRC/zgetri.f @@ -107,7 +107,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup getri * * ===================================================================== SUBROUTINE ZGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -152,7 +152,7 @@ * INFO = 0 NB = ILAENV( 1, 'ZGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zgetsls.f b/lapack-netlib/SRC/zgetsls.f index 17c6d5146..26311c611 100644 --- a/lapack-netlib/SRC/zgetsls.f +++ b/lapack-netlib/SRC/zgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEsolve +*> \ingroup getsls * * ===================================================================== SUBROUTINE ZGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB, @@ -192,7 +192,7 @@ * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH, ZLANGE - EXTERNAL LSAME, DLABAD, DLAMCH, ZLANGE + EXTERNAL LSAME, DLAMCH, ZLANGE * .. * .. External Subroutines .. EXTERNAL ZGEQR, ZGEMQR, ZLASCL, ZLASET, @@ -229,7 +229,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEO = 1 + WSIZEM = 1 + ELSE IF( M.GE.N ) THEN CALL ZGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) @@ -297,7 +300,6 @@ * SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * diff --git a/lapack-netlib/SRC/zgetsqrhrt.f b/lapack-netlib/SRC/zgetsqrhrt.f index 5f0167937..e7ce993aa 100644 --- a/lapack-netlib/SRC/zgetsqrhrt.f +++ b/lapack-netlib/SRC/zgetsqrhrt.f @@ -131,13 +131,15 @@ *> \param[in] LWORK *> \verbatim *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -160,7 +162,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup comlpex16OTHERcomputational +*> \ingroup getsqrhrt * *> \par Contributors: * ================== @@ -212,7 +214,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -225,7 +227,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -263,8 +265,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN INFO = -11 END IF * diff --git a/lapack-netlib/SRC/zgges3.f b/lapack-netlib/SRC/zgges3.f index 8b3e44f88..8235c2543 100644 --- a/lapack-netlib/SRC/zgges3.f +++ b/lapack-netlib/SRC/zgges3.f @@ -215,7 +215,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N) +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -260,7 +261,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEeigen +*> \ingroup gges3 * * ===================================================================== SUBROUTINE ZGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, @@ -300,7 +301,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT + $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT, + $ LWKMIN DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SMLNUM * .. @@ -309,9 +311,8 @@ DOUBLE PRECISION DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL DLABAD, XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, - $ ZLAQZ0, ZLACPY, ZLASCL, ZLASET, ZTGSEN, ZUNGQR, - $ ZUNMQR + EXTERNAL XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, ZLAQZ0, + $ ZLACPY, ZLASCL, ZLASET, ZTGSEN, ZUNGQR, ZUNMQR * .. * .. External Functions .. LOGICAL LSAME @@ -353,6 +354,8 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -369,7 +372,7 @@ INFO = -14 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -16 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * @@ -377,28 +380,32 @@ * IF( INFO.EQ.0 ) THEN CALL ZGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N + INT( WORK( 1 ) ) ) CALL ZUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) IF( ILVSL ) THEN CALL ZUNGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, IERR ) LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) END IF CALL ZGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL, $ LDVSL, VSR, LDVSR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) CALL ZLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, WORK, -1, $ RWORK, 0, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) IF( WANTST ) THEN CALL ZTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, SDIM, $ PVSL, PVSR, DIF, WORK, -1, IDUM, 1, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = DCMPLX( LWKOPT ) END IF - WORK( 1 ) = DCMPLX( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -420,7 +427,6 @@ EPS = DLAMCH( 'P' ) SMLNUM = DLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/zggev3.f b/lapack-netlib/SRC/zggev3.f index 2d6c74582..0cc073470 100644 --- a/lapack-netlib/SRC/zggev3.f +++ b/lapack-netlib/SRC/zggev3.f @@ -174,7 +174,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -208,7 +209,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEeigen +*> \ingroup ggev3 * * ===================================================================== SUBROUTINE ZGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHA, BETA, @@ -243,7 +244,7 @@ CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, $ IN, IRIGHT, IROWS, IRWRK, ITAU, IWRK, JC, JR, - $ LWKOPT + $ LWKMIN, LWKOPT DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP COMPLEX*16 X @@ -252,9 +253,8 @@ LOGICAL LDUMMA( 1 ) * .. * .. External Subroutines .. - EXTERNAL DLABAD, XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, - $ ZLAQZ0, ZLACPY, ZLASCL, ZLASET, ZTGEVC, ZUNGQR, - $ ZUNMQR + EXTERNAL XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, ZLAQZ0, + $ ZLACPY, ZLASCL, ZLASET, ZTGEVC, ZUNGQR, ZUNMQR * .. * .. External Functions .. LOGICAL LSAME @@ -301,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -315,7 +316,7 @@ INFO = -11 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -15 END IF * @@ -323,7 +324,7 @@ * IF( INFO.EQ.0 ) THEN CALL ZGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N+INT( WORK( 1 ) ) ) CALL ZUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) @@ -348,7 +349,11 @@ $ RWORK, 0, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) END IF - WORK( 1 ) = DCMPLX( LWKOPT ) + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = DCMPLX( LWKOPT ) + END IF END IF * IF( INFO.NE.0 ) THEN @@ -368,7 +373,6 @@ EPS = DLAMCH( 'E' )*DLAMCH( 'B' ) SMLNUM = DLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/zgghd3.f b/lapack-netlib/SRC/zgghd3.f index b29cdc70a..08343688d 100644 --- a/lapack-netlib/SRC/zgghd3.f +++ b/lapack-netlib/SRC/zgghd3.f @@ -176,14 +176,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -208,7 +208,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup gghd3 * *> \par Further Details: * ===================== @@ -275,7 +275,12 @@ * INFO = 0 NB = ILAENV( 1, 'ZGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF WORK( 1 ) = DCMPLX( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) @@ -325,7 +330,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = CONE RETURN @@ -883,6 +887,7 @@ IF ( JCOL.LT.IHI ) $ CALL ZGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) +* WORK( 1 ) = DCMPLX( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/zggqrf.f b/lapack-netlib/SRC/zggqrf.f index 0388b0874..d8636d663 100644 --- a/lapack-netlib/SRC/zggqrf.f +++ b/lapack-netlib/SRC/zggqrf.f @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup ggqrf * *> \par Further Details: * ===================== @@ -250,7 +250,7 @@ NB2 = ILAENV( 1, 'ZGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'ZUNMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zggrqf.f b/lapack-netlib/SRC/zggrqf.f index be912c772..69c14af24 100644 --- a/lapack-netlib/SRC/zggrqf.f +++ b/lapack-netlib/SRC/zggrqf.f @@ -172,7 +172,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup ggrqf * *> \par Further Details: * ===================== @@ -249,7 +249,7 @@ NB2 = ILAENV( 1, 'ZGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'ZUNMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zggsvd3.f b/lapack-netlib/SRC/zggsvd3.f index 71257a7c0..40624f5be 100644 --- a/lapack-netlib/SRC/zggsvd3.f +++ b/lapack-netlib/SRC/zggsvd3.f @@ -277,7 +277,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -332,7 +332,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEsing +*> \ingroup ggsvd3 * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/zggsvp3.f b/lapack-netlib/SRC/zggsvp3.f index f39ccdad3..7b465aaee 100644 --- a/lapack-netlib/SRC/zggsvp3.f +++ b/lapack-netlib/SRC/zggsvp3.f @@ -233,7 +233,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -256,7 +256,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup ggsvp3 * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/zgsvj0.f b/lapack-netlib/SRC/zgsvj0.f index 672ab7590..075c243c2 100644 --- a/lapack-netlib/SRC/zgsvj0.f +++ b/lapack-netlib/SRC/zgsvj0.f @@ -52,10 +52,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim @@ -117,7 +117,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -125,9 +125,9 @@ *> \param[in,out] V *> \verbatim *> V is COMPLEX*16 array, dimension (LDV,N) -*> If JOBV = 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/zgsvj1.f b/lapack-netlib/SRC/zgsvj1.f index 2461a9555..ba71e155f 100644 --- a/lapack-netlib/SRC/zgsvj1.f +++ b/lapack-netlib/SRC/zgsvj1.f @@ -75,10 +75,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim @@ -147,7 +147,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -155,9 +155,9 @@ *> \param[in,out] V *> \verbatim *> V is COMPLEX*16 array, dimension (LDV,N) -*> If JOBV = 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/zhbevd.f b/lapack-netlib/SRC/zhbevd.f index 0db551540..be9f01556 100644 --- a/lapack-netlib/SRC/zhbevd.f +++ b/lapack-netlib/SRC/zhbevd.f @@ -41,12 +41,6 @@ *> a complex Hermitian band matrix A. If eigenvectors are desired, it *> uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zhbevd_2stage.f b/lapack-netlib/SRC/zhbevd_2stage.f index 4522d5e79..e32c7125c 100644 --- a/lapack-netlib/SRC/zhbevd_2stage.f +++ b/lapack-netlib/SRC/zhbevd_2stage.f @@ -47,12 +47,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it *> uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zhbgvd.f b/lapack-netlib/SRC/zhbgvd.f index b0664750e..4bd02168d 100644 --- a/lapack-netlib/SRC/zhbgvd.f +++ b/lapack-netlib/SRC/zhbgvd.f @@ -46,12 +46,6 @@ *> and banded, and B is also positive definite. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zheevd.f b/lapack-netlib/SRC/zheevd.f index 7f58c7f72..8e86b9e88 100644 --- a/lapack-netlib/SRC/zheevd.f +++ b/lapack-netlib/SRC/zheevd.f @@ -41,12 +41,6 @@ *> complex Hermitian matrix A. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -122,8 +116,7 @@ *> *> \param[out] RWORK *> \verbatim -*> RWORK is DOUBLE PRECISION array, -*> dimension (LRWORK) +*> RWORK is DOUBLE PRECISION array, dimension (MAX(1,LRWORK)) *> On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK. *> \endverbatim *> @@ -186,7 +179,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEeigen +*> \ingroup heevd * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/zheevd_2stage.f b/lapack-netlib/SRC/zheevd_2stage.f index 9859b0d67..e697a9823 100644 --- a/lapack-netlib/SRC/zheevd_2stage.f +++ b/lapack-netlib/SRC/zheevd_2stage.f @@ -46,12 +46,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zheevr.f b/lapack-netlib/SRC/zheevr.f index 1452e04a3..fe6e1a85f 100644 --- a/lapack-netlib/SRC/zheevr.f +++ b/lapack-netlib/SRC/zheevr.f @@ -272,7 +272,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,2*N). +*> The length of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 2*N. *> For optimal efficiency, LWORK >= (NB+1)*N, *> where NB is the max of the blocksize for ZHETRD and for *> ZUNMTR as returned by ILAENV. @@ -294,7 +295,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. *> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -313,7 +315,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -338,7 +341,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEeigen +*> \ingroup heevr * *> \par Contributors: * ================== @@ -417,9 +420,15 @@ LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR. $ ( LIWORK.EQ.-1 ) ) * - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) - LWMIN = MAX( 1, 2*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 2*N + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -454,7 +463,7 @@ NB = ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'ZUNMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKOPT RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * @@ -483,7 +492,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = DBLE( A( 1, 1 ) ) @@ -710,7 +719,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKOPT RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * diff --git a/lapack-netlib/SRC/zheevr_2stage.f b/lapack-netlib/SRC/zheevr_2stage.f index 5c576e633..b1cc7175f 100644 --- a/lapack-netlib/SRC/zheevr_2stage.f +++ b/lapack-netlib/SRC/zheevr_2stage.f @@ -265,7 +265,7 @@ *> indicating the nonzero elements in Z. The i-th eigenvector *> is nonzero only in elements ISUPPZ( 2*i-1 ) through *> ISUPPZ( 2*i ). This is an output of ZSTEMR (tridiagonal -*> matrix). The support of the eigenvectors of A is typically +*> matrix). The support of the eigenvectors of A is typically *> 1:N because of the unitary transformations applied by ZUNMTR. *> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 *> \endverbatim @@ -279,12 +279,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -310,7 +311,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. *> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -329,7 +331,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -354,7 +357,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEeigen +*> \ingroup heevr_2stage * *> \par Contributors: * ================== @@ -382,7 +385,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -390,11 +393,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -472,9 +475,16 @@ IB = ILAENV2STAGE( 2, 'ZHETRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = N + LHTRD + LWTRD - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = N + LHTRD + LWTRD + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -535,7 +545,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = DBLE( A( 1, 1 ) ) @@ -643,9 +653,9 @@ * * Call ZHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form. * - CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), + CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), $ RWORK( INDRE ), WORK( INDTAU ), - $ WORK( INDHOUS ), LHTRD, + $ WORK( INDHOUS ), LHTRD, $ WORK( INDWK ), LLWORK, IINFO ) * * If all eigenvalues are desired diff --git a/lapack-netlib/SRC/zhegv.f b/lapack-netlib/SRC/zhegv.f index 41657e3be..c973bd0fc 100644 --- a/lapack-netlib/SRC/zhegv.f +++ b/lapack-netlib/SRC/zhegv.f @@ -160,7 +160,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/zhegv_2stage.f b/lapack-netlib/SRC/zhegv_2stage.f index fda651e5e..91ac09311 100644 --- a/lapack-netlib/SRC/zhegv_2stage.f +++ b/lapack-netlib/SRC/zhegv_2stage.f @@ -179,7 +179,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/zhegvd.f b/lapack-netlib/SRC/zhegvd.f index eeda656ad..c9ff55e3d 100644 --- a/lapack-netlib/SRC/zhegvd.f +++ b/lapack-netlib/SRC/zhegvd.f @@ -43,12 +43,6 @@ *> B are assumed to be Hermitian and B is also positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -212,7 +206,7 @@ *> the submatrix lying in rows and columns INFO/(N+1) *> through mod(INFO,N+1); *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/zhegvx.f b/lapack-netlib/SRC/zhegvx.f index ac9f9ef1a..71ed1c4ca 100644 --- a/lapack-netlib/SRC/zhegvx.f +++ b/lapack-netlib/SRC/zhegvx.f @@ -280,7 +280,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/zhesv_aa.f b/lapack-netlib/SRC/zhesv_aa.f index df8498c7a..b3d4b3725 100644 --- a/lapack-netlib/SRC/zhesv_aa.f +++ b/lapack-netlib/SRC/zhesv_aa.f @@ -128,7 +128,7 @@ *> LWORK is INTEGER *> The length of WORK. LWORK >= MAX(1,2*N,3*N-2), and for best *> performance LWORK >= max(1,N*NB), where NB is the optimal -*> blocksize for ZHETRF. +*> blocksize for ZHETRF_AA. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEsolve +*> \ingroup hesv_aa * * ===================================================================== SUBROUTINE ZHESV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -177,7 +177,7 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS + INTEGER LWKMIN, LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS * .. * .. External Functions .. LOGICAL LSAME @@ -196,6 +196,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -206,17 +207,17 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_HETRF = INT( WORK(1) ) + LWKOPT_HETRF = INT( WORK( 1 ) ) CALL ZHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_HETRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS ) + LWKOPT_HETRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_HETRF, LWKOPT_HETRS ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/zhesv_aa_2stage.f b/lapack-netlib/SRC/zhesv_aa_2stage.f index ee1596855..c503b5554 100644 --- a/lapack-netlib/SRC/zhesv_aa_2stage.f +++ b/lapack-netlib/SRC/zhesv_aa_2stage.f @@ -88,7 +88,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> @@ -100,14 +100,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX*16 array, dimension (LTB) +*> TB is COMPLEX*16 array, dimension (MAX(1,LTB)). *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -147,14 +147,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 workspace of size LWORK +*> WORK is COMPLEX*16 workspace of size (MAX(1,LWORK)). +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -178,7 +179,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEsolve +*> \ingroup hesv_aa_2stage * * ===================================================================== SUBROUTINE ZHESV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, @@ -208,7 +209,7 @@ * * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKOPT, LWKMIN * .. * .. External Functions .. LOGICAL LSAME @@ -229,6 +230,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -237,18 +239,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zhesvx.f b/lapack-netlib/SRC/zhesvx.f index 485c81df6..64aa16674 100644 --- a/lapack-netlib/SRC/zhesvx.f +++ b/lapack-netlib/SRC/zhesvx.f @@ -234,8 +234,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= max(1,2*N), and for best -*> performance, when FACT = 'N', LWORK >= max(1,2*N,N*NB), where +*> The length of WORK. LWORK >= MAX(1,2*N), and for best +*> performance, when FACT = 'N', LWORK >= MAX(1,2*N,N*NB), where *> NB is the optimal blocksize for ZHETRF. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -276,7 +276,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEsolve +*> \ingroup hesvx * * ===================================================================== SUBROUTINE ZHESVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B, @@ -307,7 +307,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKOPT, LWKMIN, NB DOUBLE PRECISION ANORM * .. * .. External Functions .. @@ -329,6 +329,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -346,12 +347,12 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 2*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) diff --git a/lapack-netlib/SRC/zhetf2_rk.f b/lapack-netlib/SRC/zhetf2_rk.f index 050c7993d..87df901aa 100644 --- a/lapack-netlib/SRC/zhetf2_rk.f +++ b/lapack-netlib/SRC/zhetf2_rk.f @@ -480,7 +480,7 @@ A( J, K ) = DCONJG( A( P, J ) ) A( P, J ) = T 14 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = DCONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( K, K ) ) @@ -508,7 +508,7 @@ A( J, KK ) = DCONJG( A( KP, J ) ) A( KP, J ) = T 15 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = DCONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( KK, KK ) ) @@ -834,7 +834,7 @@ A( J, K ) = DCONJG( A( P, J ) ) A( P, J ) = T 44 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = DCONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( K, K ) ) @@ -862,7 +862,7 @@ A( J, KK ) = DCONJG( A( KP, J ) ) A( KP, J ) = T 45 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = DCONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( KK, KK ) ) diff --git a/lapack-netlib/SRC/zhetf2_rook.f b/lapack-netlib/SRC/zhetf2_rook.f index 94bb29736..91172f601 100644 --- a/lapack-netlib/SRC/zhetf2_rook.f +++ b/lapack-netlib/SRC/zhetf2_rook.f @@ -420,7 +420,7 @@ A( J, K ) = DCONJG( A( P, J ) ) A( P, J ) = T 14 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = DCONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( K, K ) ) @@ -441,7 +441,7 @@ A( J, KK ) = DCONJG( A( KP, J ) ) A( KP, J ) = T 15 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = DCONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( KK, KK ) ) @@ -733,7 +733,7 @@ A( J, K ) = DCONJG( A( P, J ) ) A( P, J ) = T 44 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = DCONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( K, K ) ) @@ -754,7 +754,7 @@ A( J, KK ) = DCONJG( A( KP, J ) ) A( KP, J ) = T 45 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = DCONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( KK, KK ) ) diff --git a/lapack-netlib/SRC/zhetrd_2stage.f b/lapack-netlib/SRC/zhetrd_2stage.f index b9d2f0eb1..ab444894b 100644 --- a/lapack-netlib/SRC/zhetrd_2stage.f +++ b/lapack-netlib/SRC/zhetrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download ZHETRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download ZHETRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * COMPLEX*16 A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. -* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. *> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the unitary *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is COMPLEX*16 array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is COMPLEX*16 array, dimension (LHOUS2) +*> HOUS2 is COMPLEX*16 array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -143,23 +145,26 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +182,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +207,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +215,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -265,10 +270,13 @@ * KD = ILAENV2STAGE( 1, 'ZHETRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'ZHETRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'ZHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -309,14 +317,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL ZHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL ZHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHETRD_HE2HB', -INFO ) RETURN END IF - CALL ZHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL ZHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHETRD_HB2ST', -INFO ) @@ -324,7 +332,6 @@ END IF * * - HOUS2( 1 ) = LHMIN WORK( 1 ) = LWMIN RETURN * diff --git a/lapack-netlib/SRC/zhetrd_hb2st.F b/lapack-netlib/SRC/zhetrd_hb2st.F index e839271a4..247497ab6 100644 --- a/lapack-netlib/SRC/zhetrd_hb2st.F +++ b/lapack-netlib/SRC/zhetrd_hb2st.F @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, +* SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, * D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * #if defined(_OPENMP) @@ -53,12 +53,12 @@ *> \param[in] STAGE1 *> \verbatim *> STAGE1 is CHARACTER*1 -*> = 'N': "No": to mention that the stage 1 of the reduction +*> = 'N': "No": to mention that the stage 1 of the reduction *> from dense to band using the zhetrd_he2hb routine -*> was not called before this routine to reproduce AB. -*> In other term this routine is called as standalone. -*> = 'Y': "Yes": to mention that the stage 1 of the -*> reduction from dense to band using the zhetrd_he2hb +*> was not called before this routine to reproduce AB. +*> In other term this routine is called as standalone. +*> = 'Y': "Yes": to mention that the stage 1 of the +*> reduction from dense to band using the zhetrd_he2hb *> routine has been called to produce AB (e.g., AB is *> the output of zhetrd_he2hb. *> \endverbatim @@ -66,10 +66,10 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> and thus LHOUS is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate or to apply Q later on, +*> = 'V': the Householder representation is needed to +*> either generate or to apply Q later on, *> then LHOUS is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -132,34 +132,39 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is COMPLEX*16 array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is COMPLEX*16 array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. +*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error *> message related to LHOUS is issued by XERBLA. *> LHOUS = MAX(1, dimension) where *> dimension = 4*N if VECT='N' -*> not available now if VECT='H' +*> not available now if VECT='H' *> \endverbatim *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension LWORK. +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)). +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -188,7 +193,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup hetrd_hb2st * *> \par Further Details: * ===================== @@ -208,7 +213,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -216,16 +221,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, + SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, $ D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * @@ -259,11 +264,11 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, WANTQ, UPPER, AFTERS1 - INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, + INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, - $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, + $ NBTILES, TTYPE, TID, NTHREADS, + $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SIZEV, SIZETAU, LDV, LHMIN, LWMIN DOUBLE PRECISION ABSTMP @@ -277,7 +282,7 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV2STAGE + INTEGER ILAENV2STAGE EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. @@ -285,7 +290,6 @@ * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -294,9 +298,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -358,7 +367,7 @@ ABDPOS = KD + 1 ABOFDPOS = KD ELSE - APOS = INDA + APOS = INDA AWPOS = INDA + KD + 1 DPOS = APOS OFDPOS = DPOS + 1 @@ -366,11 +375,11 @@ ABOFDPOS = 2 ENDIF -* -* Case KD=0: -* The matrix is diagonal. We just copy it (convert to "real" for -* complex because D is double and the imaginary part should be 0) -* and store it in D. A sequential code here is better or +* +* Case KD=0: +* The matrix is diagonal. We just copy it (convert to "real" for +* complex because D is double and the imaginary part should be 0) +* and store it in D. A sequential code here is better or * in a parallel environment it might need two cores for D and E * IF( KD.EQ.0 ) THEN @@ -385,17 +394,17 @@ WORK( 1 ) = 1 RETURN END IF -* -* Case KD=1: -* The matrix is already Tridiagonal. We have to make diagonal +* +* Case KD=1: +* The matrix is already Tridiagonal. We have to make diagonal * and offdiagonal elements real, and store them in D and E. -* For that, for real precision just copy the diag and offdiag -* to D and E while for the COMPLEX case the bulge chasing is -* performed to convert the hermetian tridiagonal to symmetric -* tridiagonal. A simpler conversion formula might be used, but then +* For that, for real precision just copy the diag and offdiag +* to D and E while for the COMPLEX case the bulge chasing is +* performed to convert the hermetian tridiagonal to symmetric +* tridiagonal. A simpler conversion formula might be used, but then * updating the Q matrix will be required and based if Q is generated -* or not this might complicate the story. -* +* or not this might complicate the story. +* IF( KD.EQ.1 ) THEN DO 50 I = 1, N D( I ) = DBLE( AB( ABDPOS, I ) ) @@ -444,7 +453,7 @@ C END IF RETURN END IF * -* Main code start here. +* Main code start here. * Reduce the hermitian band of A to a tridiagonal matrix. * THGRSIZ = N @@ -453,7 +462,7 @@ C END IF NBTILES = CEILING( REAL(N)/REAL(KD) ) STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) ) THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) ) -* +* CALL ZLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA ) CALL ZLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA ) * @@ -462,7 +471,7 @@ C END IF * #if defined(_OPENMP) !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) -!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) +!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) !$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) !$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) !$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) @@ -471,7 +480,7 @@ C END IF #endif * * main bulge chasing loop -* +* DO 100 THGRID = 1, THGRNB STT = (THGRID-1)*THGRSIZ+1 THED = MIN( (STT + THGRSIZ -1), (N-1)) @@ -482,7 +491,7 @@ C END IF ST = STT DO 130 SWEEPID = ST, ED DO 140 K = 1, GRSIZ - MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) + MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) $ + (M-1)*GRSIZ + K IF ( MYID.EQ.1 ) THEN TTYPE = 1 @@ -508,17 +517,17 @@ C END IF ENDIF * * Call the kernel -* +* #if defined(_OPENMP) && _OPENMP >= 201307 - IF( TTYPE.NE.1 ) THEN + IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK @@ -526,20 +535,20 @@ C END IF !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK ENDIF #else - CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, - $ WORK( INDW + TID*KD ) ) -#endif + $ WORK( INDW ) ) +#endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 EXIT @@ -554,14 +563,14 @@ C END IF !$OMP END MASTER !$OMP END PARALLEL #endif -* +* * Copy the diagonal from A to D. Note that D is REAL thus only * the Real part is needed, the imaginary part should be zero. * DO 150 I = 1, N D( I ) = DBLE( WORK( DPOS+(I-1)*LDA ) ) 150 CONTINUE -* +* * Copy the off diagonal from A to E. Note that E is REAL thus only * the Real part is needed, the imaginary part should be zero. * @@ -575,11 +584,10 @@ C END IF 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN WORK( 1 ) = LWMIN RETURN * * End of ZHETRD_HB2ST * END - + diff --git a/lapack-netlib/SRC/zhetrd_he2hb.f b/lapack-netlib/SRC/zhetrd_he2hb.f index e1b2e1794..3e3bfa374 100644 --- a/lapack-netlib/SRC/zhetrd_he2hb.f +++ b/lapack-netlib/SRC/zhetrd_he2hb.f @@ -123,8 +123,8 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. *> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY). +*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -158,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrd_he2hb * *> \par Further Details: * ===================== @@ -293,8 +295,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zhetrf.f b/lapack-netlib/SRC/zhetrf.f index 78d4f71b8..433887108 100644 --- a/lapack-netlib/SRC/zhetrf.f +++ b/lapack-netlib/SRC/zhetrf.f @@ -107,7 +107,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> \endverbatim *> @@ -130,7 +130,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrf * *> \par Further Details: * ===================== @@ -227,7 +227,7 @@ * Determine the block size * NB = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * @@ -346,6 +346,7 @@ END IF * 40 CONTINUE +* WORK( 1 ) = LWKOPT RETURN * diff --git a/lapack-netlib/SRC/zhetrf_aa.f b/lapack-netlib/SRC/zhetrf_aa.f index 56722e7e6..381c87d51 100644 --- a/lapack-netlib/SRC/zhetrf_aa.f +++ b/lapack-netlib/SRC/zhetrf_aa.f @@ -74,7 +74,7 @@ *> *> On exit, the tridiagonal matrix is stored in the diagonals *> and the subdiagonals of A just below (or above) the diagonals, -*> and L is stored below (or above) the subdiaonals, when UPLO +*> and L is stored below (or above) the subdiagonals, when UPLO *> is 'L' (or 'U'). *> \endverbatim *> @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= MAX(1,2*N). For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N >= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -125,10 +127,10 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -152,7 +154,7 @@ * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB COMPLEX*16 ALPHA * .. @@ -178,18 +180,25 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N WORK( 1 ) = LWKOPT END IF * @@ -202,11 +211,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN A( 1, 1 ) = DBLE( A( 1, 1 ) ) RETURN END IF diff --git a/lapack-netlib/SRC/zhetrf_aa_2stage.f b/lapack-netlib/SRC/zhetrf_aa_2stage.f index 92f1c09b3..bab13a99d 100644 --- a/lapack-netlib/SRC/zhetrf_aa_2stage.f +++ b/lapack-netlib/SRC/zhetrf_aa_2stage.f @@ -75,7 +75,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> @@ -87,14 +87,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX*16 array, dimension (LTB) +*> TB is COMPLEX*16 array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 workspace of size LWORK +*> WORK is COMPLEX*16 workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16SYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -182,7 +182,7 @@ * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY INTEGER I, J, K, I1, I2, TD - INTEGER LDTB, NB, KB, JB, NT, IINFO + INTEGER LWKOPT, LDTB, NB, KB, JB, NT, IINFO COMPLEX*16 PIV * .. * .. External Functions .. @@ -212,9 +212,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -228,10 +228,10 @@ NB = ILAENV( 1, 'ZHETRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = MAX( 1, (3*NB+1)*N ) END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = MAX( 1, N*NB ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -240,7 +240,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * @@ -392,7 +392,7 @@ CALL ZGETRF( N-(J+1)*NB, NB, $ WORK, N, $ IPIV( (J+1)*NB+1 ), IINFO ) -c IF (IINFO.NE.0 .AND. INFO.EQ.0) THEN +c IF( IINFO.NE.0 .AND. INFO.EQ.0 ) THEN c INFO = IINFO+(J+1)*NB c END IF * @@ -587,7 +587,7 @@ c END IF CALL ZGETRF( N-(J+1)*NB, NB, $ A( (J+1)*NB+1, J*NB+1 ), LDA, $ IPIV( (J+1)*NB+1 ), IINFO ) -c IF (IINFO.NE.0 .AND. INFO.EQ.0) THEN +c IF( IINFO.NE.0 .AND. INFO.EQ.0 ) THEN c INFO = IINFO+(J+1)*NB c END IF * diff --git a/lapack-netlib/SRC/zhetrf_rk.f b/lapack-netlib/SRC/zhetrf_rk.f index 73dd9f9d0..7c505fa4d 100644 --- a/lapack-netlib/SRC/zhetrf_rk.f +++ b/lapack-netlib/SRC/zhetrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension ( MAX(1,LWORK) ). +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. *> @@ -229,7 +229,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrf_rk * *> \par Further Details: * ===================== @@ -310,7 +310,7 @@ * Determine the block size * NB = ILAENV( 1, 'ZHETRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/zhetrf_rook.f b/lapack-netlib/SRC/zhetrf_rook.f index e9de47248..a56349092 100644 --- a/lapack-netlib/SRC/zhetrf_rook.f +++ b/lapack-netlib/SRC/zhetrf_rook.f @@ -122,7 +122,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -150,7 +150,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrf_rook * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/zhetri2.f b/lapack-netlib/SRC/zhetri2.f index 384745c3a..1d932b866 100644 --- a/lapack-netlib/SRC/zhetri2.f +++ b/lapack-netlib/SRC/zhetri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (N+NB+1)*(NB+3) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)). *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE ZHETRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -159,9 +159,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +177,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHETRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = MINSIZE RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - IF( NBMAX .GE. N ) THEN + IF( NBMAX.GE.N ) THEN CALL ZHETRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL ZHETRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of ZHETRI2 diff --git a/lapack-netlib/SRC/zhetrs_aa.f b/lapack-netlib/SRC/zhetrs_aa.f index 06ac1fd28..b7a1f7f07 100644 --- a/lapack-netlib/SRC/zhetrs_aa.f +++ b/lapack-netlib/SRC/zhetrs_aa.f @@ -106,7 +106,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -124,7 +130,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrs_aa * * ===================================================================== SUBROUTINE ZHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, @@ -152,7 +158,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME @@ -162,13 +168,19 @@ EXTERNAL ZGTSV, ZSWAP, ZTRSM, ZLACGV, ZLACPY, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -179,21 +191,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHETRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKMIN RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN diff --git a/lapack-netlib/SRC/zhpevd.f b/lapack-netlib/SRC/zhpevd.f index 7625c8fe8..5260aaf14 100644 --- a/lapack-netlib/SRC/zhpevd.f +++ b/lapack-netlib/SRC/zhpevd.f @@ -41,12 +41,6 @@ *> a complex Hermitian matrix A in packed storage. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zhpgv.f b/lapack-netlib/SRC/zhpgv.f index 72876b6e4..b92168555 100644 --- a/lapack-netlib/SRC/zhpgv.f +++ b/lapack-netlib/SRC/zhpgv.f @@ -144,7 +144,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not convergeto zero; *> > N: if INFO = N + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/zhpgvd.f b/lapack-netlib/SRC/zhpgvd.f index e96e39738..dfe92067c 100644 --- a/lapack-netlib/SRC/zhpgvd.f +++ b/lapack-netlib/SRC/zhpgvd.f @@ -44,12 +44,6 @@ *> positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -205,7 +199,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not convergeto zero; *> > N: if INFO = N + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/zhpgvx.f b/lapack-netlib/SRC/zhpgvx.f index 94d7f7733..de75b486b 100644 --- a/lapack-netlib/SRC/zhpgvx.f +++ b/lapack-netlib/SRC/zhpgvx.f @@ -250,7 +250,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/zla_gbrfsx_extended.f b/lapack-netlib/SRC/zla_gbrfsx_extended.f index fe4d635b1..2f57b7682 100644 --- a/lapack-netlib/SRC/zla_gbrfsx_extended.f +++ b/lapack-netlib/SRC/zla_gbrfsx_extended.f @@ -651,7 +651,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL ZAXPY( N, (1.0D+0,0.0D+0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/zla_gerfsx_extended.f b/lapack-netlib/SRC/zla_gerfsx_extended.f index 9d618f294..22e45c5a9 100644 --- a/lapack-netlib/SRC/zla_gerfsx_extended.f +++ b/lapack-netlib/SRC/zla_gerfsx_extended.f @@ -636,7 +636,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL ZAXPY( N, (1.0D+0,0.0D+0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/zla_herfsx_extended.f b/lapack-netlib/SRC/zla_herfsx_extended.f index a55dd9431..689460a02 100644 --- a/lapack-netlib/SRC/zla_herfsx_extended.f +++ b/lapack-netlib/SRC/zla_herfsx_extended.f @@ -655,7 +655,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL ZAXPY( N, DCMPLX(1.0D+0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/zla_porfsx_extended.f b/lapack-netlib/SRC/zla_porfsx_extended.f index 12e05e049..e853494fc 100644 --- a/lapack-netlib/SRC/zla_porfsx_extended.f +++ b/lapack-netlib/SRC/zla_porfsx_extended.f @@ -626,7 +626,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL ZAXPY( N, DCMPLX(1.0D+0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/zla_porpvgrw.f b/lapack-netlib/SRC/zla_porpvgrw.f index 9b381a072..897589aa0 100644 --- a/lapack-netlib/SRC/zla_porpvgrw.f +++ b/lapack-netlib/SRC/zla_porpvgrw.f @@ -142,9 +142,9 @@ * .. Executable Statements .. UPPER = LSAME( 'Upper', UPLO ) * -* DPOTRF will have factored only the NCOLSxNCOLS leading minor, so -* we restrict the growth search to that minor and use only the first -* 2*NCOLS workspace entries. +* DPOTRF will have factored only the NCOLSxNCOLS leading submatrix, +* so we restrict the growth search to that submatrix and use only +* the first 2*NCOLS workspace entries. * RPVGRW = 1.0D+0 DO I = 1, 2*NCOLS diff --git a/lapack-netlib/SRC/zla_syrfsx_extended.f b/lapack-netlib/SRC/zla_syrfsx_extended.f index d6c241499..fb1b9e2d7 100644 --- a/lapack-netlib/SRC/zla_syrfsx_extended.f +++ b/lapack-netlib/SRC/zla_syrfsx_extended.f @@ -655,7 +655,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL ZAXPY( N, DCMPLX(1.0D+0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/zlaed7.f b/lapack-netlib/SRC/zlaed7.f index 83f32d8b8..86e5ec6b5 100644 --- a/lapack-netlib/SRC/zlaed7.f +++ b/lapack-netlib/SRC/zlaed7.f @@ -363,7 +363,7 @@ RETURN END IF * -* Prepare the INDXQ sorting premutation. +* Prepare the INDXQ sorting permutation. * N1 = K N2 = N - K diff --git a/lapack-netlib/SRC/zlaed8.f b/lapack-netlib/SRC/zlaed8.f index 995a673de..003725820 100644 --- a/lapack-netlib/SRC/zlaed8.f +++ b/lapack-netlib/SRC/zlaed8.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE ZLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMDA, +* SUBROUTINE ZLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMBDA, * Q2, LDQ2, W, INDXP, INDX, INDXQ, PERM, GIVPTR, * GIVCOL, GIVNUM, INFO ) * @@ -29,7 +29,7 @@ * .. Array Arguments .. * INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), * $ INDXQ( * ), PERM( * ) -* DOUBLE PRECISION D( * ), DLAMDA( * ), GIVNUM( 2, * ), W( * ), +* DOUBLE PRECISION D( * ), DLAMBDA( * ), GIVNUM( 2, * ), W( * ), * $ Z( * ) * COMPLEX*16 Q( LDQ, * ), Q2( LDQ2, * ) * .. @@ -122,9 +122,9 @@ *> destroyed during the updating process. *> \endverbatim *> -*> \param[out] DLAMDA +*> \param[out] DLAMBDA *> \verbatim -*> DLAMDA is DOUBLE PRECISION array, dimension (N) +*> DLAMBDA is DOUBLE PRECISION array, dimension (N) *> Contains a copy of the first K eigenvalues which will be used *> by DLAED3 to form the secular equation. *> \endverbatim @@ -222,7 +222,7 @@ *> \ingroup complex16OTHERcomputational * * ===================================================================== - SUBROUTINE ZLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMDA, + SUBROUTINE ZLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMBDA, $ Q2, LDQ2, W, INDXP, INDX, INDXQ, PERM, GIVPTR, $ GIVCOL, GIVNUM, INFO ) * @@ -237,7 +237,7 @@ * .. Array Arguments .. INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), $ INDXQ( * ), PERM( * ) - DOUBLE PRECISION D( * ), DLAMDA( * ), GIVNUM( 2, * ), W( * ), + DOUBLE PRECISION D( * ), DLAMBDA( * ), GIVNUM( 2, * ), W( * ), $ Z( * ) COMPLEX*16 Q( LDQ, * ), Q2( LDQ2, * ) * .. @@ -322,14 +322,14 @@ INDXQ( I ) = INDXQ( I ) + CUTPNT 20 CONTINUE DO 30 I = 1, N - DLAMDA( I ) = D( INDXQ( I ) ) + DLAMBDA( I ) = D( INDXQ( I ) ) W( I ) = Z( INDXQ( I ) ) 30 CONTINUE I = 1 J = CUTPNT + 1 - CALL DLAMRG( N1, N2, DLAMDA, 1, 1, INDX ) + CALL DLAMRG( N1, N2, DLAMBDA, 1, 1, INDX ) DO 40 I = 1, N - D( I ) = DLAMDA( INDX( I ) ) + D( I ) = DLAMBDA( INDX( I ) ) Z( I ) = W( INDX( I ) ) 40 CONTINUE * @@ -438,7 +438,7 @@ ELSE K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM JLAM = J END IF @@ -450,19 +450,19 @@ * K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM * 100 CONTINUE * -* Sort the eigenvalues and corresponding eigenvectors into DLAMDA +* Sort the eigenvalues and corresponding eigenvectors into DLAMBDA * and Q2 respectively. The eigenvalues/vectors which were not -* deflated go into the first K slots of DLAMDA and Q2 respectively, +* deflated go into the first K slots of DLAMBDA and Q2 respectively, * while those which were deflated go into the last N - K slots. * DO 110 J = 1, N JP = INDXP( J ) - DLAMDA( J ) = D( JP ) + DLAMBDA( J ) = D( JP ) PERM( J ) = INDXQ( INDX( JP ) ) CALL ZCOPY( QSIZ, Q( 1, PERM( J ) ), 1, Q2( 1, J ), 1 ) 110 CONTINUE @@ -471,7 +471,7 @@ * into the last N - K slots of D and Q respectively. * IF( K.LT.N ) THEN - CALL DCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 ) + CALL DCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 ) CALL ZLACPY( 'A', QSIZ, N-K, Q2( 1, K+1 ), LDQ2, Q( 1, K+1 ), $ LDQ ) END IF diff --git a/lapack-netlib/SRC/zlals0.f b/lapack-netlib/SRC/zlals0.f index 7a7310042..79c0cf5e4 100644 --- a/lapack-netlib/SRC/zlals0.f +++ b/lapack-netlib/SRC/zlals0.f @@ -392,6 +392,11 @@ $ ( POLES( I, 2 ).EQ.ZERO ) ) THEN RWORK( I ) = ZERO ELSE +* +* Use calls to the subroutine DLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent +* optimizing compilers from doing x+(y+z). +* RWORK( I ) = POLES( I, 2 )*Z( I ) / $ ( DLAMC3( POLES( I, 2 ), DSIGJ )- $ DIFLJ ) / ( POLES( I, 2 )+DJ ) @@ -470,6 +475,11 @@ IF( Z( J ).EQ.ZERO ) THEN RWORK( I ) = ZERO ELSE +* +* Use calls to the subroutine DLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent +* optimizing compilers from doing x+(y+z). +* RWORK( I ) = Z( J ) / ( DLAMC3( DSIGJ, -POLES( I+1, $ 2 ) )-DIFR( I, 1 ) ) / $ ( DSIGJ+POLES( I, 1 ) ) / DIFR( I, 2 ) diff --git a/lapack-netlib/SRC/zlalsa.f b/lapack-netlib/SRC/zlalsa.f index 73ccf3a8b..d419598d2 100644 --- a/lapack-netlib/SRC/zlalsa.f +++ b/lapack-netlib/SRC/zlalsa.f @@ -42,9 +42,9 @@ *> *> \verbatim *> -*> ZLALSA is an itermediate step in solving the least squares problem +*> ZLALSA is an intermediate step in solving the least squares problem *> by computing the SVD of the coefficient matrix in compact form (The -*> singular vectors are computed as products of simple orthorgonal +*> singular vectors are computed as products of simple orthogonal *> matrices.). *> *> If ICOMPQ = 0, ZLALSA applies the inverse of the left singular vector diff --git a/lapack-netlib/SRC/zlalsd.f b/lapack-netlib/SRC/zlalsd.f index dca308e56..1d7358aa9 100644 --- a/lapack-netlib/SRC/zlalsd.f +++ b/lapack-netlib/SRC/zlalsd.f @@ -48,12 +48,6 @@ *> problem; in this case a minimum norm solution is returned. *> The actual singular values are returned in D in ascending order. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zlamswlq.f b/lapack-netlib/SRC/zlamswlq.f index 4abefa434..59a0a5558 100644 --- a/lapack-netlib/SRC/zlamswlq.f +++ b/lapack-netlib/SRC/zlamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -189,92 +192,103 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamswlq +*> * ===================================================================== SUBROUTINE ZLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX*16 A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. External Subroutines .. - EXTERNAL ZTPMLQT, ZGEMLQT, XERBLA + EXTERNAL ZTPMLQT, ZGEMLQT, XERBLA * .. * .. Executable Statements .. * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLAMSWLQ', -INFO ) - WORK(1) = LW RETURN - ELSE IF (LQUERY) THEN - WORK(1) = LW + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * IF((NB.LE.K).OR.(NB.GE.MAX(M,N,K))) THEN CALL ZGEMLQT( SIDE, TRANS, M, N, K, MB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN END IF * @@ -403,7 +417,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN RETURN * * End of ZLAMSWLQ diff --git a/lapack-netlib/SRC/zlamtsqr.f b/lapack-netlib/SRC/zlamtsqr.f index 5030cb75f..03770c06e 100644 --- a/lapack-netlib/SRC/zlamtsqr.f +++ b/lapack-netlib/SRC/zlamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). *> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -191,46 +193,50 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamtsqr +*> * ===================================================================== SUBROUTINE ZLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX*16 A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. External Subroutines .. - EXTERNAL ZGEMQRT, ZTPMQRT, XERBLA + EXTERNAL ZGEMQRT, ZTPMQRT, XERBLA * .. * .. Executable Statements .. * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -238,11 +244,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -253,38 +265,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = LW + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL ZGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -410,7 +422,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN RETURN * * End of ZLAMTSQR diff --git a/lapack-netlib/SRC/zlaqp2rk.c b/lapack-netlib/SRC/zlaqp2rk.c new file mode 100644 index 000000000..0d38e71fb --- /dev/null +++ b/lapack-netlib/SRC/zlaqp2rk.c @@ -0,0 +1,947 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + *info = *n + kk - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL >= ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL >= ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + +/* Set K, the number of factorized columns. */ + + *k = kk - 1; + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. */ + + i__2 = minmnfact; + for (j = kk; j <= i__2; ++j) { + i__3 = j; + tau[i__3].r = 0., tau[i__3].i = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,KK:N): */ +/* 1) swap the KK-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) copy the KK-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than KK in the next loop step.) */ +/* 3) Save the pivot interchange with the indices relative to the */ +/* the original matrix A, not the block A(1:M,1:N). */ + + if (kp != kk) { + zswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1); + vn1[kp] = vn1[kk]; + vn2[kp] = vn2[kk]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[kk]; + jpiv[kk] = itemp; + } + +/* Generate elementary reflector H(KK) using the column A(I:M,KK), */ +/* if the column has more than one element, otherwise */ +/* the elementary reflector would be an identity matrix, */ +/* and TAU(KK) = CZERO. */ + + if (i__ < *m) { + i__2 = *m - i__ + 1; + zlarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], & + c__1, &tau[kk]); + } else { + i__2 = kk; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + +/* Check if TAU(KK) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(KK) for Inf, */ +/* since ZLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by ZLARFG can contain Inf, which requires */ +/* TAU(KK) to contain NaN. Therefore, this case of generating Inf */ +/* by ZLARFG is covered by checking TAU(KK) for NaN. */ + + i__2 = kk; + d__1 = tau[i__2].r; + if (disnan_(&d__1)) { + i__2 = kk; + taunan = tau[i__2].r; + } else /* if(complicated condition) */ { + d__1 = d_imag(&tau[kk]); + if (disnan_(&d__1)) { + taunan = d_imag(&tau[kk]); + } else { + taunan = 0.; + } + } + + if (disnan_(&taunan)) { + *k = kk - 1; + *info = kk; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = taunan; + *relmaxc2nrmk = taunan; + +/* Array TAU(KK:MINMNFACT) is not set and contains */ +/* undefined elements, except the first element TAU(KK) = NaN. */ + + return 0; + } + +/* Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left. */ +/* ( If M >= N, then at KK = N there is no residual matrix, */ +/* i.e. no columns of A to update, only columns of B. */ +/* If M < N, then at KK = M-IOFFSET, I = M and we have a */ +/* one-row residual matrix in A and the elementary */ +/* reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update */ +/* is needed for the residual matrix in A and the */ +/* right-hand-side-matrix in B. */ +/* Therefore, we update only if */ +/* KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */ +/* condition is satisfied, not only KK < N+NRHS ) */ + + if (kk < minmnupdt) { + i__2 = i__ + kk * a_dim1; + aikk.r = a[i__2].r, aikk.i = a[i__2].i; + i__2 = i__ + kk * a_dim1; + a[i__2].r = 1., a[i__2].i = 0.; + i__2 = *m - i__ + 1; + i__3 = *n + *nrhs - kk; + d_cnjg(&z__1, &tau[kk]); + zlarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &z__1, + &a[i__ + (kk + 1) * a_dim1], lda, &work[1]); + i__2 = i__ + kk * a_dim1; + a[i__2].r = aikk.r, a[i__2].i = aikk.i; + } + + if (kk < minmnfact) { + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */ +/* when KK < f2cmin(M-IOFFSET, N). */ + + i__2 = *n; + for (j = kk + 1; j <= i__2; ++j) { + if (vn1[j] != 0.) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + +/* Computing 2nd power */ + d__1 = z_abs(&a[i__ + j * a_dim1]) / vn1[j]; + temp = 1. - d__1 * d__1; + temp = f2cmax(temp,0.); +/* Computing 2nd power */ + d__1 = vn1[j] / vn2[j]; + temp2 = temp * (d__1 * d__1); + if (temp2 <= tol3z) { + +/* Compute the column 2-norm for the partial */ +/* column A(I+1:M,J) by explicitly computing it, */ +/* and store it in both partial 2-norm vector VN1 */ +/* and exact column 2-norm vector VN2. */ + + i__3 = *m - i__; + vn1[j] = dznrm2_(&i__3, &a[i__ + 1 + j * a_dim1], & + c__1); + vn2[j] = vn1[j]; + + } else { + +/* Update the column 2-norm for the partial */ +/* column A(I+1:M,J) by removing one */ +/* element A(I,J) and store it in partial */ +/* 2-norm vector VN1. */ + + vn1[j] *= sqrt(temp); + + } + } + } + + } + +/* End factorization loop */ + + } + +/* If we reached this point, all colunms have been factorized, */ +/* i.e. no condition was triggered to exit the routine. */ +/* Set the number of factorized columns. */ + + *k = *kmax; + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */ +/* we return. */ + + if (*k < minmnfact) { + + i__1 = *n - *k; + jmaxc2nrm = *k + idamax_(&i__1, &vn1[*k + 1], &c__1); + *maxc2nrmk = vn1[jmaxc2nrm]; + + if (*k == 0) { + *relmaxc2nrmk = 1.; + } else { + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + } + + } else { + *maxc2nrmk = 0.; + *relmaxc2nrmk = 0.; + } + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, set TAUs corresponding to the columns that were */ +/* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO. */ + + i__1 = minmnfact; + for (j = *k + 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + + return 0; + +/* End of ZLAQP2RK */ + +} /* zlaqp2rk_ */ + diff --git a/lapack-netlib/SRC/zlaqp2rk.f b/lapack-netlib/SRC/zlaqp2rk.f new file mode 100644 index 000000000..f6bf555c2 --- /dev/null +++ b/lapack-netlib/SRC/zlaqp2rk.f @@ -0,0 +1,726 @@ +*> \brief \b ZLAQP2RK computes truncated QR factorization with column pivoting of a complex matrix block using Level 2 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZLAQP2RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, +* $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, +* $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, +* $ INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS +* DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER JPIV( * ) +* DOUBLE PRECISION VN1( * ), VN2( * ) +* COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) +* $ +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLAQP2RK computes a truncated (rank K) or full rank Householder QR +*> factorization with column pivoting of the complex matrix +*> block A(IOFFSET+1:M,1:N) as +*> +*> A * P(K) = Q(K) * R(K). +*> +*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N) +*> is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides matrix block B +*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**H * B. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M-IOFFSET,N), then this stopping +*> criterion is not used, factorize columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The second factorization stopping criterion. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The third factorization stopping criterion. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine ZGEQP3RK. 1 <= KP1 <= N_orig_mat. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is DOUBLE PRECISION +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine ZGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:K) below +*> the diagonal together with the array TAU represent +*> the unitary matrix Q(K) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:K) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS). +*> The left part A(IOFFSET+1:M,K+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(K)**H. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX*16 array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is DOUBLE PRECISION array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is DOUBLE PRECISION array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (N-1) +*> Used in ZLARF subroutine to apply an elementary +*> reflector from the left. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp2rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, + $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, + $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, + $ INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS + DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. + INTEGER JPIV( * ) + DOUBLE PRECISION VN1( * ), VN2( * ) + COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + COMPLEX*16 CZERO, CONE + PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), + $ CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT, + $ MINMNUPDT + DOUBLE PRECISION HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z + COMPLEX*16 AIKK +* .. +* .. External Subroutines .. + EXTERNAL ZLARF, ZLARFG, ZSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL DISNAN + INTEGER IDAMAX + DOUBLE PRECISION DLAMCH, DZNRM2 + EXTERNAL DISNAN, DLAMCH, IDAMAX, DZNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* +* MINMNUPDT is the smallest dimension +* of the subarray A(IOFFSET+1:M,1:N+NRHS) to be udated, which +* contains the submatrices A(IOFFSET+1:M,1:N) and +* B(IOFFSET+1:M,1:NRHS) as column blocks. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + KMAX = MIN( KMAX, MINMNFACT ) + TOL3Z = SQRT( DLAMCH( 'Epsilon' ) ) + HUGEVAL = DLAMCH( 'Overflow' ) +* +* Compute the factorization, KK is the lomn loop index. +* + DO KK = 1, KMAX +* + I = IOFFSET + KK +* + IF( I.EQ.1 ) THEN +* +* ============================================================ +* +* We are at the first column of the original whole matrix A, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + KP = KP1 +* +* ============================================================ +* + ELSE +* +* ============================================================ +* +* Determine the pivot column in KK-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). +* + KP = ( KK-1 ) + IDAMAX( N-KK+1, VN1( KK ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,KK:N) in step KK. +* RELMAXC2NRMK will be computed later, after somecondition +* checks on MAXC2NRMK. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains NaN, and set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( DISNAN( MAXC2NRMK ) ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + INFO = K + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* Array TAU(K+1:MINMNFACT) is not set and contains +* undefined elements. +* + RETURN + END IF +* +* ============================================================ +* +* Quick return, if the submatrix A(I:M,KK:N) is +* a zero matrix. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + RELMAXC2NRMK = ZERO +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = CZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + KK - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third stopping criteria. +* NOTE: There is no need to test for ABSTOL >= ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL >= ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. + + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* +* Set K, the number of factorized columns. +* + K = KK - 1 +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = CZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,KK:N): +* 1) swap the KK-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) copy the KK-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than KK in the next loop step.) +* 3) Save the pivot interchange with the indices relative to the +* the original matrix A, not the block A(1:M,1:N). +* + IF( KP.NE.KK ) THEN + CALL ZSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 ) + VN1( KP ) = VN1( KK ) + VN2( KP ) = VN2( KK ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( KK ) + JPIV( KK ) = ITEMP + END IF +* +* Generate elementary reflector H(KK) using the column A(I:M,KK), +* if the column has more than one element, otherwise +* the elementary reflector would be an identity matrix, +* and TAU(KK) = CZERO. +* + IF( I.LT.M ) THEN + CALL ZLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1, + $ TAU( KK ) ) + ELSE + TAU( KK ) = CZERO + END IF +* +* Check if TAU(KK) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(KK) for Inf, +* since ZLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by ZLARFG can contain Inf, which requires +* TAU(KK) to contain NaN. Therefore, this case of generating Inf +* by ZLARFG is covered by checking TAU(KK) for NaN. +* + IF( DISNAN( DBLE( TAU(KK) ) ) ) THEN + TAUNAN = DBLE( TAU(KK) ) + ELSE IF( DISNAN( DIMAG( TAU(KK) ) ) ) THEN + TAUNAN = DIMAG( TAU(KK) ) + ELSE + TAUNAN = ZERO + END IF +* + IF( DISNAN( TAUNAN ) ) THEN + K = KK - 1 + INFO = KK +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAUNAN + RELMAXC2NRMK = TAUNAN +* +* Array TAU(KK:MINMNFACT) is not set and contains +* undefined elements, except the first element TAU(KK) = NaN. +* + RETURN + END IF +* +* Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left. +* ( If M >= N, then at KK = N there is no residual matrix, +* i.e. no columns of A to update, only columns of B. +* If M < N, then at KK = M-IOFFSET, I = M and we have a +* one-row residual matrix in A and the elementary +* reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update +* is needed for the residual matrix in A and the +* right-hand-side-matrix in B. +* Therefore, we update only if +* KK < MINMNUPDT = min(M-IOFFSET, N+NRHS) +* condition is satisfied, not only KK < N+NRHS ) +* + IF( KK.LT.MINMNUPDT ) THEN + AIKK = A( I, KK ) + A( I, KK ) = CONE + CALL ZLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1, + $ DCONJG( TAU( KK ) ), A( I, KK+1 ), LDA, + $ WORK( 1 ) ) + A( I, KK ) = AIKK + END IF +* + IF( KK.LT.MINMNFACT ) THEN +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. +* when KK < min(M-IOFFSET, N). +* + DO J = KK + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2 + TEMP = MAX( TEMP, ZERO ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2 .LE. TOL3Z ) THEN +* +* Compute the column 2-norm for the partial +* column A(I+1:M,J) by explicitly computing it, +* and store it in both partial 2-norm vector VN1 +* and exact column 2-norm vector VN2. +* + VN1( J ) = DZNRM2( M-I, A( I+1, J ), 1 ) + VN2( J ) = VN1( J ) +* + ELSE +* +* Update the column 2-norm for the partial +* column A(I+1:M,J) by removing one +* element A(I,J) and store it in partial +* 2-norm vector VN1. +* + VN1( J ) = VN1( J )*SQRT( TEMP ) +* + END IF + END IF + END DO +* + END IF +* +* End factorization loop +* + END DO +* +* If we reached this point, all colunms have been factorized, +* i.e. no condition was triggered to exit the routine. +* Set the number of factorized columns. +* + K = KMAX +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before +* we return. +* + IF( K.LT.MINMNFACT ) THEN +* + JMAXC2NRM = K + IDAMAX( N-K, VN1( K+1 ), 1 ) + MAXC2NRMK = VN1( JMAXC2NRM ) +* + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + END IF +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, set TAUs corresponding to the columns that were +* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO. +* + DO J = K + 1, MINMNFACT + TAU( J ) = CZERO + END DO +* + RETURN +* +* End of ZLAQP2RK +* + END diff --git a/lapack-netlib/SRC/zlaqp3rk.c b/lapack-netlib/SRC/zlaqp3rk.c new file mode 100644 index 000000000..cb44e4d34 --- /dev/null +++ b/lapack-netlib/SRC/zlaqp3rk.c @@ -0,0 +1,1157 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, + kb, &z__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* Quick return, if the submatrix A(I:M,K:N) is */ +/* a zero matrix. We need to check it only if the column index */ +/* (same as row index) is larger than 1, since the condition */ +/* for the whole original matrix A_orig is checked in the main */ +/* routine. */ + + if (*maxc2nrmk == 0.) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *relmaxc2nrmk = 0.; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix is zero and we stop the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, + kb, &z__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* Check if the submatrix A(I:M,K:N) contains Inf, */ +/* set INFO parameter to the column number, where */ +/* the first Inf is found plus N, and continue */ +/* the computation. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + if (*info == 0 && *maxc2nrmk > myhugeval) { + *info = *n + k - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third tolerance stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL.GE.ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig; */ + + *kb = k - 1; + if__ = i__ - 1; + +/* Apply the block reflector to the residual of the */ +/* matrix A and the residual of the right hand sides B, if */ +/* the residual matrix and and/or the residual of the right */ +/* hand sides exist, i.e. if the submatrix */ +/* A(I+1:M,KB+1:N+NRHS) exists. This occurs when */ +/* KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &i__1, & + i__2, kb, &z__1, &a[if__ + 1 + a_dim1], lda, &f[* + kb + 1 + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*kb + + 1) * a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,K:N): */ +/* 1) swap the K-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */ +/* 3) copy the K-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. (Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than K in the next loop step.) */ +/* 4) Save the pivot interchange with the indices relative to the */ +/* the original matrix A_orig, not the block A(1:M,1:N). */ + + if (kp != k) { + zswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + zswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf); + vn1[kp] = vn1[k]; + vn2[kp] = vn2[k]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[k]; + jpiv[k] = itemp; + } + +/* Apply previous Householder reflectors to column K: */ +/* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H. */ + + if (k > 1) { + i__1 = k - 1; + for (j = 1; j <= i__1; ++j) { + i__2 = k + j * f_dim1; + d_cnjg(&z__1, &f[k + j * f_dim1]); + f[i__2].r = z__1.r, f[i__2].i = z__1.i; + } + i__1 = *m - i__ + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = 0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[i__ + a_dim1], lda, + &f[k + f_dim1], ldf, &c_b2, &a[i__ + k * a_dim1], &c__1); + i__1 = k - 1; + for (j = 1; j <= i__1; ++j) { + i__2 = k + j * f_dim1; + d_cnjg(&z__1, &f[k + j * f_dim1]); + f[i__2].r = z__1.r, f[i__2].i = z__1.i; + } + } + +/* Generate elementary reflector H(k) using the column A(I:M,K). */ + + if (i__ < *m) { + i__1 = *m - i__ + 1; + zlarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], & + c__1, &tau[k]); + } else { + i__1 = k; + tau[i__1].r = 0., tau[i__1].i = 0.; + } + +/* Check if TAU(K) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(K) for Inf, */ +/* since ZLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by ZLARFG can contain Inf, which requires */ +/* TAU(K) to contain NaN. Therefore, this case of generating Inf */ +/* by ZLARFG is covered by checking TAU(K) for NaN. */ + + i__1 = k; + d__1 = tau[i__1].r; + if (disnan_(&d__1)) { + i__1 = k; + taunan = tau[i__1].r; + } else /* if(complicated condition) */ { + d__1 = d_imag(&tau[k]); + if (disnan_(&d__1)) { + taunan = d_imag(&tau[k]); + } else { + taunan = 0.; + } + } + + if (disnan_(&taunan)) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *info = k; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = taunan; + *relmaxc2nrmk = taunan; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix contains NaN and we stop */ +/* the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, kb, + &z__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * a_dim1], + lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* =============================================================== */ + + i__1 = i__ + k * a_dim1; + aik.r = a[i__1].r, aik.i = a[i__1].i; + i__1 = i__ + k * a_dim1; + a[i__1].r = 1., a[i__1].i = 0.; + +/* =============================================================== */ + +/* Compute the current K-th column of F: */ +/* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K). */ + + if (k < *n + *nrhs) { + i__1 = *m - i__ + 1; + i__2 = *n + *nrhs - k; + zgemv_("Conjugate transpose", &i__1, &i__2, &tau[k], &a[i__ + (k + + 1) * a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, & + f[k + 1 + k * f_dim1], &c__1); + } + +/* 2) Zero out elements above and on the diagonal of the */ +/* column K in matrix F, i.e elements F(1:K,K). */ + + i__1 = k; + for (j = 1; j <= i__1; ++j) { + i__2 = j + k * f_dim1; + f[i__2].r = 0., f[i__2].i = 0.; + } + +/* 3) Incremental updating of the K-th column of F: */ +/* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H */ +/* * A(I:M,K). */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + i__3 = k; + z__1.r = -tau[i__3].r, z__1.i = -tau[i__3].i; + zgemv_("Conjugate Transpose", &i__1, &i__2, &z__1, &a[i__ + + a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, &auxv[1] + , &c__1); + + i__1 = *n + *nrhs; + i__2 = k - 1; + zgemv_("No transpose", &i__1, &i__2, &c_b2, &f[f_dim1 + 1], ldf, & + auxv[1], &c__1, &c_b2, &f[k * f_dim1 + 1], &c__1); + } + +/* =============================================================== */ + +/* Update the current I-th row of A: */ +/* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */ +/* - A(I,1:K)*F(K+1:N+NRHS,1:K)**H. */ + + if (k < *n + *nrhs) { + i__1 = *n + *nrhs - k; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &c__1, &i__1, &k, & + z__1, &a[i__ + a_dim1], lda, &f[k + 1 + f_dim1], ldf, & + c_b2, &a[i__ + (k + 1) * a_dim1], lda); + } + + i__1 = i__ + k * a_dim1; + a[i__1].r = aik.r, a[i__1].i = aik.i; + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */ +/* when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */ + + if (k < minmnfact) { + + i__1 = *n; + for (j = k + 1; j <= i__1; ++j) { + if (vn1[j] != 0.) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + + temp = z_abs(&a[i__ + j * a_dim1]) / vn1[j]; +/* Computing MAX */ + d__1 = 0., d__2 = (temp + 1.) * (1. - temp); + temp = f2cmax(d__1,d__2); +/* Computing 2nd power */ + d__1 = vn1[j] / vn2[j]; + temp2 = temp * (d__1 * d__1); + if (temp2 <= tol3z) { + +/* At J-index, we have a difficult column for the */ +/* update of the 2-norm. Save the index of the previous */ +/* difficult column in IWORK(J-1). */ +/* NOTE: ILSTCC > 1, threfore we can use IWORK only */ +/* with N-1 elements, where the elements are */ +/* shifted by 1 to the left. */ + + iwork[j - 1] = lsticc; + +/* Set the index of the last difficult column LSTICC. */ + + lsticc = j; + + } else { + vn1[j] *= sqrt(temp); + } + } + } + + } + +/* End of while loop. */ + + } + +/* Now, afler the loop: */ +/* Set KB, the number of factorized columns in the block; */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig, IF = IOFFSET + KB. */ + + *kb = k; + if__ = i__; + +/* Apply the block reflector to the residual of the matrix A */ +/* and the residual of the right hand sides B, if the residual */ +/* matrix and and/or the residual of the right hand sides */ +/* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */ +/* This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &i__1, &i__2, kb, &z__1, + &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b2, + &a[if__ + 1 + (*kb + 1) * a_dim1], lda); + } + +/* Recompute the 2-norm of the difficult columns. */ +/* Loop over the index of the difficult columns from the largest */ +/* to the smallest index. */ + + while(lsticc > 0) { + +/* LSTICC is the index of the last difficult column is greater */ +/* than 1. */ +/* ITEMP is the index of the previous difficult column. */ + + itemp = iwork[lsticc - 1]; + +/* Compute the 2-norm explicilty for the last difficult column and */ +/* save it in the partial and exact 2-norm vectors VN1 and VN2. */ + +/* NOTE: The computation of VN1( LSTICC ) relies on the fact that */ +/* DZNRM2 does not fail on vectors with norm below the value of */ +/* SQRT(DLAMCH('S')) */ + + i__1 = *m - if__; + vn1[lsticc] = dznrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1); + vn2[lsticc] = vn1[lsticc]; + +/* Downdate the index of the last difficult column to */ +/* the index of the previous difficult column. */ + + lsticc = itemp; + + } + + return 0; + +/* End of ZLAQP3RK */ + +} /* zlaqp3rk_ */ + diff --git a/lapack-netlib/SRC/zlaqp3rk.f b/lapack-netlib/SRC/zlaqp3rk.f new file mode 100644 index 000000000..0dd8bf8e3 --- /dev/null +++ b/lapack-netlib/SRC/zlaqp3rk.f @@ -0,0 +1,947 @@ +*> \brief \b ZLAQP3RK computes a step of truncated QR factorization with column pivoting of a complex m-by-n matrix A using Level 3 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZLAQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, +* $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, +* $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) +* IMPLICIT NONE +* LOGICAL DONE +* INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, +* $ NB, NRHS +* DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* DOUBLE PRECISION VN1( * ), VN2( * ) +* COMPLEX*16 A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLAQP3RK computes a step of truncated QR factorization with column +*> pivoting of a complex M-by-N matrix A block A(IOFFSET+1:M,1:N) +*> by using Level 3 BLAS as +*> +*> A * P(KB) = Q(KB) * R(KB). +*> +*> The routine tries to factorize NB columns from A starting from +*> the row IOFFSET+1 and updates the residual matrix with BLAS 3 +*> xGEMM. The number of actually factorized columns is returned +*> is smaller than NB. +*> +*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides B matrix stored +*> in A(IOFFSET+1:M,1:N+1:N+NRHS) with Q(KB)**H * B. +*> +*> Cases when the number of factorized columns KB < NB: +*> +*> (1) In some cases, due to catastrophic cancellations, it cannot +*> factorize all NB columns and need to update the residual matrix. +*> Hence, the actual number of factorized columns in the block returned +*> in KB is smaller than NB. The logical DONE is returned as FALSE. +*> The factorization of the whole original matrix A_orig must proceed +*> with the next block. +*> +*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied, +*> the factorization of the whole original matrix A_orig is stopped, +*> the logical DONE is returned as TRUE. The number of factorized +*> columns which is smaller than NB is returned in KB. +*> +*> (3) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix is a zero matrix in some factorization +*> step KB, the factorization of the whole original matrix A_orig is +*> stopped, the logical DONE is returned as TRUE. The number of +*> factorized columns which is smaller than NB is returned in KB. +*> +*> (4) Whenever NaN is detected in the matrix A or in the array TAU, +*> the factorization of the whole original matrix A_orig is stopped, +*> the logical DONE is returned as TRUE. The number of factorized +*> columns which is smaller than NB is returned in KB. The INFO +*> parameter is set to the column index of the first NaN occurrence. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0 +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> Factorization block size, i.e the number of columns +*> to factorize in the matrix A. 0 <= NB +*> +*> If NB = 0, then the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on NB and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on NB and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine ZGEQP3RK. 1 <= KP1 <= N_orig. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is DOUBLE PRECISION +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine ZGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:KB) below +*> the diagonal together with the array TAU represent +*> the unitary matrix Q(KB) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:KB) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS). +*> The left part A(IOFFSET+1:M,KB+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(KB)**H. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] +*> \verbatim +*> DONE is LOGICAL +*> TRUE: a) if the factorization completed before processing +*> all min(M-IOFFSET,NB,N) columns due to ABSTOL +*> or RELTOL criterion, +*> b) if the factorization completed before processing +*> all min(M-IOFFSET,NB,N) columns due to the +*> residual matrix being a ZERO matrix. +*> c) when NaN was detected in the matrix A +*> or in the array TAU. +*> FALSE: otherwise. +*> \endverbatim +*> +*> \param[out] KB +*> \verbatim +*> KB is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= KB <= min(M-IOFFSET,NB,N). +*> +*> KB also represents the number of non-zero Householder +*> vectors. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank KB. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank KB) to the maximum column 2-norm of the +*> original matrix A_orig. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX*16 array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is DOUBLE PRECISION array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is DOUBLE PRECISION array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] AUXV +*> \verbatim +*> AUXV is COMPLEX*16 array, dimension (NB) +*> Auxiliary vector. +*> \endverbatim +*> +*> \param[out] F +*> \verbatim +*> F is COMPLEX*16 array, dimension (LDF,NB) +*> Matrix F**H = L*(Y**H)*A. +*> \endverbatim +*> +*> \param[in] LDF +*> \verbatim +*> LDF is INTEGER +*> The leading dimension of the array F. LDF >= max(1,N+NRHS). +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step KB+1 ( when KB columns +*> have been factorized ). +*> +*> On exit: +*> KB is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(KB+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=KB+1, TAU(KB+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the actorization +*> step KB+1 ( when KB columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp3rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, + $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + LOGICAL DONE + INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, + $ NB, NRHS + DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + DOUBLE PRECISION VN1( * ), VN2( * ) + COMPLEX*16 A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + COMPLEX*16 CZERO, CONE + PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), + $ CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER ITEMP, J, K, MINMNFACT, MINMNUPDT, + $ LSTICC, KP, I, IF + DOUBLE PRECISION HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z + COMPLEX*16 AIK +* .. +* .. External Subroutines .. + EXTERNAL ZGEMM, ZGEMV, ZLARFG, ZSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL DISNAN + INTEGER IDAMAX + DOUBLE PRECISION DLAMCH, DZNRM2 + EXTERNAL DISNAN, DLAMCH, IDAMAX, DZNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + NB = MIN( NB, MINMNFACT ) + TOL3Z = SQRT( DLAMCH( 'Epsilon' ) ) + HUGEVAL = DLAMCH( 'Overflow' ) +* +* Compute factorization in a while loop over NB columns, +* K is the column index in the block A(1:M,1:N). +* + K = 0 + LSTICC = 0 + DONE = .FALSE. +* + DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 ) + K = K + 1 + I = IOFFSET + K +* + IF( I.EQ.1 ) THEN +* +* We are at the first column of the original whole matrix A_orig, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + KP = KP1 +* + ELSE +* +* Determine the pivot column in K-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). +* + KP = ( K-1 ) + IDAMAX( N-K+1, VN1( K ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,K:N) in step K. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains NaN, set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( DISNAN( MAXC2NRMK ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = KB + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. + + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* Quick return, if the submatrix A(I:M,K:N) is +* a zero matrix. We need to check it only if the column index +* (same as row index) is larger than 1, since the condition +* for the whole original matrix A_orig is checked in the main +* routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + RELMAXC2NRMK = ZERO +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix is zero and we stop the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. +* + DO J = K, MINMNFACT + TAU( J ) = CZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + K - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third tolerance stopping criteria. +* NOTE: There is no need to test for ABSTOL.GE.ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig; +* + KB = K - 1 + IF = I - 1 +* +* Apply the block reflector to the residual of the +* matrix A and the residual of the right hand sides B, if +* the residual matrix and and/or the residual of the right +* hand sides exist, i.e. if the submatrix +* A(I+1:M,KB+1:N+NRHS) exists. This occurs when +* KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, N+NRHS-KB, KB,-CONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. +* + DO J = K, MINMNFACT + TAU( J ) = CZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,K:N): +* 1) swap the K-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) +* 3) copy the K-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. (Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than K in the next loop step.) +* 4) Save the pivot interchange with the indices relative to the +* the original matrix A_orig, not the block A(1:M,1:N). +* + IF( KP.NE.K ) THEN + CALL ZSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 ) + CALL ZSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF ) + VN1( KP ) = VN1( K ) + VN2( KP ) = VN2( K ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( K ) + JPIV( K ) = ITEMP + END IF +* +* Apply previous Householder reflectors to column K: +* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H. +* + IF( K.GT.1 ) THEN + DO J = 1, K - 1 + F( K, J ) = DCONJG( F( K, J ) ) + END DO + CALL ZGEMV( 'No transpose', M-I+1, K-1, -CONE, A( I, 1 ), + $ LDA, F( K, 1 ), LDF, CONE, A( I, K ), 1 ) + DO J = 1, K - 1 + F( K, J ) = DCONJG( F( K, J ) ) + END DO + END IF +* +* Generate elementary reflector H(k) using the column A(I:M,K). +* + IF( I.LT.M ) THEN + CALL ZLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) ) + ELSE + TAU( K ) = CZERO + END IF +* +* Check if TAU(K) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(K) for Inf, +* since ZLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by ZLARFG can contain Inf, which requires +* TAU(K) to contain NaN. Therefore, this case of generating Inf +* by ZLARFG is covered by checking TAU(K) for NaN. +* + IF( DISNAN( DBLE( TAU(K) ) ) ) THEN + TAUNAN = DBLE( TAU(K) ) + ELSE IF( DISNAN( DIMAG( TAU(K) ) ) ) THEN + TAUNAN = DIMAG( TAU(K) ) + ELSE + TAUNAN = ZERO + END IF +* + IF( DISNAN( TAUNAN ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = K +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAUNAN + RELMAXC2NRMK = TAUNAN +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* =============================================================== +* + AIK = A( I, K ) + A( I, K ) = CONE +* +* =============================================================== +* +* Compute the current K-th column of F: +* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K). +* + IF( K.LT.N+NRHS ) THEN + CALL ZGEMV( 'Conjugate transpose', M-I+1, N+NRHS-K, + $ TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1, + $ CZERO, F( K+1, K ), 1 ) + END IF +* +* 2) Zero out elements above and on the diagonal of the +* column K in matrix F, i.e elements F(1:K,K). +* + DO J = 1, K + F( J, K ) = CZERO + END DO +* +* 3) Incremental updating of the K-th column of F: +* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H +* * A(I:M,K). +* + IF( K.GT.1 ) THEN + CALL ZGEMV( 'Conjugate Transpose', M-I+1, K-1, -TAU( K ), + $ A( I, 1 ), LDA, A( I, K ), 1, CZERO, + $ AUXV( 1 ), 1 ) +* + CALL ZGEMV( 'No transpose', N+NRHS, K-1, CONE, + $ F( 1, 1 ), LDF, AUXV( 1 ), 1, CONE, + $ F( 1, K ), 1 ) + END IF +* +* =============================================================== +* +* Update the current I-th row of A: +* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) +* - A(I,1:K)*F(K+1:N+NRHS,1:K)**H. +* + IF( K.LT.N+NRHS ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ 1, N+NRHS-K, K, -CONE, A( I, 1 ), LDA, + $ F( K+1, 1 ), LDF, CONE, A( I, K+1 ), LDA ) + END IF +* + A( I, K ) = AIK +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. +* when K < MINMNFACT = min( M-IOFFSET, N ). +* + IF( K.LT.MINMNFACT ) THEN +* + DO J = K + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ABS( A( I, J ) ) / VN1( J ) + TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2.LE.TOL3Z ) THEN +* +* At J-index, we have a difficult column for the +* update of the 2-norm. Save the index of the previous +* difficult column in IWORK(J-1). +* NOTE: ILSTCC > 1, threfore we can use IWORK only +* with N-1 elements, where the elements are +* shifted by 1 to the left. +* + IWORK( J-1 ) = LSTICC +* +* Set the index of the last difficult column LSTICC. +* + LSTICC = J +* + ELSE + VN1( J ) = VN1( J )*SQRT( TEMP ) + END IF + END IF + END DO +* + END IF +* +* End of while loop. +* + END DO +* +* Now, afler the loop: +* Set KB, the number of factorized columns in the block; +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig, IF = IOFFSET + KB. +* + KB = K + IF = I +* +* Apply the block reflector to the residual of the matrix A +* and the residual of the right hand sides B, if the residual +* matrix and and/or the residual of the right hand sides +* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. +* This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, N+NRHS-KB, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* Recompute the 2-norm of the difficult columns. +* Loop over the index of the difficult columns from the largest +* to the smallest index. +* + DO WHILE( LSTICC.GT.0 ) +* +* LSTICC is the index of the last difficult column is greater +* than 1. +* ITEMP is the index of the previous difficult column. +* + ITEMP = IWORK( LSTICC-1 ) +* +* Compute the 2-norm explicilty for the last difficult column and +* save it in the partial and exact 2-norm vectors VN1 and VN2. +* +* NOTE: The computation of VN1( LSTICC ) relies on the fact that +* DZNRM2 does not fail on vectors with norm below the value of +* SQRT(DLAMCH('S')) +* + VN1( LSTICC ) = DZNRM2( M-IF, A( IF+1, LSTICC ), 1 ) + VN2( LSTICC ) = VN1( LSTICC ) +* +* Downdate the index of the last difficult column to +* the index of the previous difficult column. +* + LSTICC = ITEMP +* + END DO +* + RETURN +* +* End of ZLAQP3RK +* + END diff --git a/lapack-netlib/SRC/zlaqz0.f b/lapack-netlib/SRC/zlaqz0.f index 3e20200ed..dcb28850a 100644 --- a/lapack-netlib/SRC/zlaqz0.f +++ b/lapack-netlib/SRC/zlaqz0.f @@ -89,7 +89,7 @@ *> Anal., 29(2006), pp. 199--227. *> *> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift, -*> multipole rational QZ method with agressive early deflation" +*> multipole rational QZ method with aggressive early deflation" *> \endverbatim * * Arguments: @@ -312,7 +312,7 @@ CHARACTER :: JBCMPZ*3 * External Functions - EXTERNAL :: XERBLA, ZHGEQZ, ZLAQZ2, ZLAQZ3, ZLASET, DLABAD, + EXTERNAL :: XERBLA, ZHGEQZ, ZLAQZ2, ZLAQZ3, ZLASET, $ ZLARTG, ZROT DOUBLE PRECISION, EXTERNAL :: DLAMCH, ZLANHS LOGICAL, EXTERNAL :: LSAME @@ -464,7 +464,6 @@ * Get machine constants SAFMIN = DLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE/SAFMIN - CALL DLABAD( SAFMIN, SAFMAX ) ULP = DLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( DBLE( N )/ULP ) @@ -535,7 +534,7 @@ DO WHILE ( K.GE.ISTART2 ) IF( ABS( B( K, K ) ) .LT. BTOL ) THEN -* A diagonal element of B is negligable, move it +* A diagonal element of B is negligible, move it * to the top and deflate it DO K2 = K, ISTART2+1, -1 diff --git a/lapack-netlib/SRC/zlarfb_gett.f b/lapack-netlib/SRC/zlarfb_gett.f index 4a3c4dcf1..17d4b33aa 100644 --- a/lapack-netlib/SRC/zlarfb_gett.f +++ b/lapack-netlib/SRC/zlarfb_gett.f @@ -452,7 +452,7 @@ IF( LNOTIDENT ) THEN * * col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2, -* V1 is not an identy matrix, but unit lower-triangular +* V1 is not an identity matrix, but unit lower-triangular * V1 stored in A1 (diagonal ones are not stored). * * diff --git a/lapack-netlib/SRC/zlarfgp.f b/lapack-netlib/SRC/zlarfgp.f index 77eba8e86..d54f2ea5d 100644 --- a/lapack-netlib/SRC/zlarfgp.f +++ b/lapack-netlib/SRC/zlarfgp.f @@ -97,7 +97,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERauxiliary +*> \ingroup larfgp * * ===================================================================== SUBROUTINE ZLARFGP( N, ALPHA, X, INCX, TAU ) @@ -122,7 +122,7 @@ * .. * .. Local Scalars .. INTEGER J, KNT - DOUBLE PRECISION ALPHI, ALPHR, BETA, BIGNUM, SMLNUM, XNORM + DOUBLE PRECISION ALPHI, ALPHR, BETA, BIGNUM, EPS, SMLNUM, XNORM COMPLEX*16 SAVEALPHA * .. * .. External Functions .. @@ -143,37 +143,28 @@ RETURN END IF * + EPS = DLAMCH( 'Precision' ) XNORM = DZNRM2( N-1, X, INCX ) ALPHR = DBLE( ALPHA ) ALPHI = DIMAG( ALPHA ) * - IF( XNORM.EQ.ZERO ) THEN + IF( XNORM.LE.EPS*ABS(ALPHA) .AND. ALPHI.EQ.ZERO ) THEN * * H = [1-alpha/abs(alpha) 0; 0 I], sign chosen so ALPHA >= 0. * - IF( ALPHI.EQ.ZERO ) THEN - IF( ALPHR.GE.ZERO ) THEN -* When TAU.eq.ZERO, the vector is special-cased to be -* all zeros in the application routines. We do not need -* to clear it. - TAU = ZERO - ELSE -* However, the application routines rely on explicit -* zero checks when TAU.ne.ZERO, and we must clear X. - TAU = TWO - DO J = 1, N-1 - X( 1 + (J-1)*INCX ) = ZERO - END DO - ALPHA = -ALPHA - END IF + IF( ALPHR.GE.ZERO ) THEN +* When TAU.eq.ZERO, the vector is special-cased to be +* all zeros in the application routines. We do not need +* to clear it. + TAU = ZERO ELSE -* Only "reflecting" the diagonal entry to be real and non-negative. - XNORM = DLAPY2( ALPHR, ALPHI ) - TAU = DCMPLX( ONE - ALPHR / XNORM, -ALPHI / XNORM ) +* However, the application routines rely on explicit +* zero checks when TAU.ne.ZERO, and we must clear X. + TAU = TWO DO J = 1, N-1 X( 1 + (J-1)*INCX ) = ZERO END DO - ALPHA = XNORM + ALPHA = -ALPHA END IF ELSE * diff --git a/lapack-netlib/SRC/zlassq.f90 b/lapack-netlib/SRC/zlassq.f90 index 9346dacac..c35214766 100644 --- a/lapack-netlib/SRC/zlassq.f90 +++ b/lapack-netlib/SRC/zlassq.f90 @@ -34,28 +34,15 @@ !> !> \verbatim !> -!> ZLASSQ returns the values scl and smsq such that +!> ZLASSQ returns the values scale_out and sumsq_out such that !> -!> ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq, +!> (scale_out**2)*sumsq_out = x( 1 )**2 +...+ x( n )**2 + (scale**2)*sumsq, !> -!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is +!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is !> assumed to be non-negative. !> !> scale and sumsq must be supplied in SCALE and SUMSQ and -!> scl and smsq are overwritten on SCALE and SUMSQ respectively. -!> -!> If scale * sqrt( sumsq ) > tbig then -!> we require: scale >= sqrt( TINY*EPS ) / sbig on entry, -!> and if 0 < scale * sqrt( sumsq ) < tsml then -!> we require: scale <= sqrt( HUGE ) / ssml on entry, -!> where -!> tbig -- upper threshold for values whose square is representable; -!> sbig -- scaling constant for big numbers; \see la_constants.f90 -!> tsml -- lower threshold for values whose square is representable; -!> ssml -- scaling constant for small numbers; \see la_constants.f90 -!> and -!> TINY*EPS -- tiniest representable number; -!> HUGE -- biggest representable number. +!> scale_out and sumsq_out are overwritten on SCALE and SUMSQ respectively. !> !> \endverbatim ! @@ -72,7 +59,7 @@ !> \verbatim !> X is DOUBLE COMPLEX array, dimension (1+(N-1)*abs(INCX)) !> The vector for which a scaled sum of squares is computed. -!> x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n. +!> x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n. !> \endverbatim !> !> \param[in] INCX @@ -82,24 +69,24 @@ !> If INCX > 0, X(1+(i-1)*INCX) = x(i) for 1 <= i <= n !> If INCX < 0, X(1-(n-i)*INCX) = x(i) for 1 <= i <= n !> If INCX = 0, x isn't a vector so there is no need to call -!> this subroutine. If you call it anyway, it will count x(1) +!> this subroutine. If you call it anyway, it will count x(1) !> in the vector norm N times. !> \endverbatim !> !> \param[in,out] SCALE !> \verbatim !> SCALE is DOUBLE PRECISION -!> On entry, the value scale in the equation above. -!> On exit, SCALE is overwritten with scl , the scaling factor +!> On entry, the value scale in the equation above. +!> On exit, SCALE is overwritten by scale_out, the scaling factor !> for the sum of squares. !> \endverbatim !> !> \param[in,out] SUMSQ !> \verbatim !> SUMSQ is DOUBLE PRECISION -!> On entry, the value sumsq in the equation above. -!> On exit, SUMSQ is overwritten with smsq , the basic sum of -!> squares from which scl has been factored out. +!> On entry, the value sumsq in the equation above. +!> On exit, SUMSQ is overwritten by sumsq_out, the basic sum of +!> squares from which scale_out has been factored out. !> \endverbatim ! ! Authors: @@ -130,10 +117,10 @@ !> !> \endverbatim ! -!> \ingroup OTHERauxiliary +!> \ingroup lassq ! ! ===================================================================== -subroutine ZLASSQ( n, x, incx, scl, sumsq ) +subroutine ZLASSQ( n, x, incx, scale, sumsq ) use LA_CONSTANTS, & only: wp=>dp, zero=>dzero, one=>done, & sbig=>dsbig, ssml=>dssml, tbig=>dtbig, tsml=>dtsml @@ -145,7 +132,7 @@ subroutine ZLASSQ( n, x, incx, scl, sumsq ) ! ! .. Scalar Arguments .. integer :: incx, n - real(wp) :: scl, sumsq + real(wp) :: scale, sumsq ! .. ! .. Array Arguments .. complex(wp) :: x(*) @@ -158,10 +145,10 @@ subroutine ZLASSQ( n, x, incx, scl, sumsq ) ! ! Quick return if possible ! - if( LA_ISNAN(scl) .or. LA_ISNAN(sumsq) ) return - if( sumsq == zero ) scl = one - if( scl == zero ) then - scl = one + if( LA_ISNAN(scale) .or. LA_ISNAN(sumsq) ) return + if( sumsq == zero ) scale = one + if( scale == zero ) then + scale = one sumsq = zero end if if (n <= 0) then @@ -207,15 +194,27 @@ subroutine ZLASSQ( n, x, incx, scl, sumsq ) ! Put the existing sum of squares into one of the accumulators ! if( sumsq > zero ) then - ax = scl*sqrt( sumsq ) + ax = scale*sqrt( sumsq ) if (ax > tbig) then -! We assume scl >= sqrt( TINY*EPS ) / sbig - abig = abig + (scl*sbig)**2 * sumsq + if (scale > one) then + scale = scale * sbig + abig = abig + scale * (scale * sumsq) + else + ! sumsq > tbig^2 => (sbig * (sbig * sumsq)) is representable + abig = abig + scale * (scale * (sbig * (sbig * sumsq))) + end if else if (ax < tsml) then -! We assume scl <= sqrt( HUGE ) / ssml - if (notbig) asml = asml + (scl*ssml)**2 * sumsq + if (notbig) then + if (scale < one) then + scale = scale * ssml + asml = asml + scale * (scale * sumsq) + else + ! sumsq < tsml^2 => (ssml * (ssml * sumsq)) is representable + asml = asml + scale * (scale * (ssml * (ssml * sumsq))) + end if + end if else - amed = amed + scl**2 * sumsq + amed = amed + scale * (scale * sumsq) end if end if ! @@ -229,7 +228,7 @@ subroutine ZLASSQ( n, x, incx, scl, sumsq ) if (amed > zero .or. LA_ISNAN(amed)) then abig = abig + (amed*sbig)*sbig end if - scl = one / sbig + scale = one / sbig sumsq = abig else if (asml > zero) then ! @@ -245,17 +244,17 @@ subroutine ZLASSQ( n, x, incx, scl, sumsq ) ymin = asml ymax = amed end if - scl = one + scale = one sumsq = ymax**2*( one + (ymin/ymax)**2 ) else - scl = one / ssml + scale = one / ssml sumsq = asml end if else ! ! Otherwise all values are mid-range or zero ! - scl = one + scale = one sumsq = amed end if return diff --git a/lapack-netlib/SRC/zlaswlq.f b/lapack-netlib/SRC/zlaswlq.f index e4e703343..735207132 100644 --- a/lapack-netlib/SRC/zlaswlq.f +++ b/lapack-netlib/SRC/zlaswlq.f @@ -96,21 +96,23 @@ *> The leading dimension of the array T. LDT >= MB. *> \endverbatim *> -*> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> \param[in] LWORK *> \verbatim -*> The dimension of the array WORK. LWORK >= MB*M. +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -158,33 +160,37 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup laswlq +*> * ===================================================================== SUBROUTINE ZLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), T( LDT, *) + COMPLEX*16 A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL ZGELQT, ZTPLQT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -195,12 +201,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LE.0 ) THEN INFO = -4 @@ -208,60 +221,61 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = MB*M +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN - CALL ZGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) + IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN + CALL ZGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO ) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL ZGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL ZGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M, (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL ZTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), - $ LDA, T(1, CTR * M + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL ZTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + $ LDA, T(1, CTR * M + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL ZTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), - $ LDA, T(1, CTR * M + 1), LDT, - $ WORK, INFO ) - END IF + $ LDA, T(1, CTR * M + 1), LDT, + $ WORK, INFO ) + END IF * - WORK( 1 ) = M * MB + WORK( 1 ) = LWMIN RETURN * * End of ZLASWLQ diff --git a/lapack-netlib/SRC/zlatdf.f b/lapack-netlib/SRC/zlatdf.f index 25e71edce..41714a3e0 100644 --- a/lapack-netlib/SRC/zlatdf.f +++ b/lapack-netlib/SRC/zlatdf.f @@ -227,7 +227,7 @@ BM = RHS( J ) - CONE SPLUS = ONE * -* Lockahead for L- part RHS(1:N-1) = +-1 +* Look-ahead for L- part RHS(1:N-1) = +-1 * SPLUS and SMIN computed more efficiently than in BSOLVE[1]. * SPLUS = SPLUS + DBLE( ZDOTC( N-J, Z( J+1, J ), 1, Z( J+1, diff --git a/lapack-netlib/SRC/zlatrs3.f b/lapack-netlib/SRC/zlatrs3.f index fc1be0517..27eac839b 100644 --- a/lapack-netlib/SRC/zlatrs3.f +++ b/lapack-netlib/SRC/zlatrs3.f @@ -158,7 +158,11 @@ *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -166,6 +170,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -182,7 +187,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -257,7 +262,7 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. @@ -296,15 +301,24 @@ * row. WORK( I + KK * LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). +* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = LWMIN * * Test the input parameters. * @@ -326,7 +340,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN @@ -577,7 +591,7 @@ * Prepare the linear update to be executed with GEMM. * For each column, compute a consistent scaling, a * scaling factor to survive the linear update, and -* rescale the column segments, if necesssary. Then +* rescale the column segments, if necessary. Then * the linear update is safely executed. * DO KK = 1, K2 - K1 diff --git a/lapack-netlib/SRC/zlatsqr.f b/lapack-netlib/SRC/zlatsqr.f index ffdbc68c0..24d00f28a 100644 --- a/lapack-netlib/SRC/zlatsqr.f +++ b/lapack-netlib/SRC/zlatsqr.f @@ -101,14 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim -*> The dimension of the array WORK. LWORK >= NB*N. +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -160,33 +164,37 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup latsqr +*> * ===================================================================== SUBROUTINE ZLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), T(LDT, *) + COMPLEX*16 A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, LWMIN, MINMN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. EXTERNAL SUBROUTINES .. - EXTERNAL ZGEQRT, ZTPQRT, XERBLA + EXTERNAL ZGEQRT, ZTPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -197,6 +205,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -204,64 +219,65 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = NB*N +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL ZGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + IF( (MB.LE.N) .OR. (MB.GE.M) ) THEN + CALL ZGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL ZGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) - CTR = 1 + CALL ZGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = MB+1, II-MB+N , (MB-N) + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL ZTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, - $ T(1, CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL ZTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + $ T(1, CTR * N + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL ZTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, - $ T(1,CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + IF( II.LE.M ) THEN + CALL ZTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + $ T(1,CTR * N + 1), LDT, + $ WORK, INFO ) + END IF * - work( 1 ) = N*NB + WORK( 1 ) = LWMIN RETURN * * End of ZLATSQR diff --git a/lapack-netlib/SRC/zpbsv.f b/lapack-netlib/SRC/zpbsv.f index fe6baf8b5..ef212bbc3 100644 --- a/lapack-netlib/SRC/zpbsv.f +++ b/lapack-netlib/SRC/zpbsv.f @@ -119,9 +119,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zpbsvx.f b/lapack-netlib/SRC/zpbsvx.f index 1efd2fd1e..724102376 100644 --- a/lapack-netlib/SRC/zpbsvx.f +++ b/lapack-netlib/SRC/zpbsvx.f @@ -70,7 +70,7 @@ *> where U is an upper triangular band matrix, and L is a lower *> triangular band matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -280,10 +280,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/zpbtf2.f b/lapack-netlib/SRC/zpbtf2.f index 7b90af47e..39bf6c3fa 100644 --- a/lapack-netlib/SRC/zpbtf2.f +++ b/lapack-netlib/SRC/zpbtf2.f @@ -97,8 +97,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/zpbtrf.f b/lapack-netlib/SRC/zpbtrf.f index 61cd1d434..80afb0836 100644 --- a/lapack-netlib/SRC/zpbtrf.f +++ b/lapack-netlib/SRC/zpbtrf.f @@ -92,8 +92,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/zpftrf.f b/lapack-netlib/SRC/zpftrf.f index 6d18f2cb3..4d4a5116e 100644 --- a/lapack-netlib/SRC/zpftrf.f +++ b/lapack-netlib/SRC/zpftrf.f @@ -91,8 +91,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> *> Further Notes on RFP Format: diff --git a/lapack-netlib/SRC/zposv.f b/lapack-netlib/SRC/zposv.f index 3bb625876..0e91cde2f 100644 --- a/lapack-netlib/SRC/zposv.f +++ b/lapack-netlib/SRC/zposv.f @@ -110,9 +110,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zposvx.f b/lapack-netlib/SRC/zposvx.f index f9e9b1d5f..6c06dbd57 100644 --- a/lapack-netlib/SRC/zposvx.f +++ b/lapack-netlib/SRC/zposvx.f @@ -70,7 +70,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -276,10 +276,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/zposvxx.f b/lapack-netlib/SRC/zposvxx.f index 6735fab71..3886c66f0 100644 --- a/lapack-netlib/SRC/zposvxx.f +++ b/lapack-netlib/SRC/zposvxx.f @@ -87,7 +87,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A (see argument RCOND). If the reciprocal of the condition number diff --git a/lapack-netlib/SRC/zpotf2.f b/lapack-netlib/SRC/zpotf2.f index eb88d617c..b48f3d654 100644 --- a/lapack-netlib/SRC/zpotf2.f +++ b/lapack-netlib/SRC/zpotf2.f @@ -89,8 +89,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/zpotrf.f b/lapack-netlib/SRC/zpotrf.f index 3b7018276..3edae84fb 100644 --- a/lapack-netlib/SRC/zpotrf.f +++ b/lapack-netlib/SRC/zpotrf.f @@ -87,8 +87,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/zpotrf2.f b/lapack-netlib/SRC/zpotrf2.f index 859ddc75f..67430f231 100644 --- a/lapack-netlib/SRC/zpotrf2.f +++ b/lapack-netlib/SRC/zpotrf2.f @@ -86,8 +86,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/zppsv.f b/lapack-netlib/SRC/zppsv.f index 19536e204..f466266e4 100644 --- a/lapack-netlib/SRC/zppsv.f +++ b/lapack-netlib/SRC/zppsv.f @@ -104,9 +104,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zppsvx.f b/lapack-netlib/SRC/zppsvx.f index f94badf78..60d07cbc7 100644 --- a/lapack-netlib/SRC/zppsvx.f +++ b/lapack-netlib/SRC/zppsvx.f @@ -69,7 +69,7 @@ *> where U is an upper triangular matrix, L is a lower triangular *> matrix, and **H indicates conjugate transpose. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -262,10 +262,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/zpptrf.f b/lapack-netlib/SRC/zpptrf.f index a34d63913..a103e5eca 100644 --- a/lapack-netlib/SRC/zpptrf.f +++ b/lapack-netlib/SRC/zpptrf.f @@ -79,8 +79,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/zpteqr.f b/lapack-netlib/SRC/zpteqr.f index a81a6ad94..897136c76 100644 --- a/lapack-netlib/SRC/zpteqr.f +++ b/lapack-netlib/SRC/zpteqr.f @@ -123,8 +123,8 @@ *> < 0: if INFO = -i, the i-th argument had an illegal value. *> > 0: if INFO = i, and i is: *> <= N the Cholesky factorization of the matrix could -*> not be performed because the i-th principal minor -*> was not positive definite. +*> not be performed because the leading principal +*> minor of order i was not positive. *> > N the SVD algorithm failed to converge; *> if INFO = N+i, i off-diagonal elements of the *> bidiagonal factor did not converge to zero. diff --git a/lapack-netlib/SRC/zptsv.f b/lapack-netlib/SRC/zptsv.f index d53e17a34..191adaff6 100644 --- a/lapack-netlib/SRC/zptsv.f +++ b/lapack-netlib/SRC/zptsv.f @@ -94,8 +94,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the solution has not been +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the solution has not been *> computed. The factorization has not been completed *> unless i = N. *> \endverbatim diff --git a/lapack-netlib/SRC/zptsvx.f b/lapack-netlib/SRC/zptsvx.f index 4a40768f1..94409a7a1 100644 --- a/lapack-netlib/SRC/zptsvx.f +++ b/lapack-netlib/SRC/zptsvx.f @@ -60,7 +60,7 @@ *> factorization can also be regarded as having the form *> A = U**H*D*U. *> -*> 2. If the leading i-by-i principal minor is not positive definite, +*> 2. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -205,10 +205,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/zpttrf.f b/lapack-netlib/SRC/zpttrf.f index a106ec419..75ef847c4 100644 --- a/lapack-netlib/SRC/zpttrf.f +++ b/lapack-netlib/SRC/zpttrf.f @@ -71,8 +71,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite; if k < N, the factorization could not +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive; if k < N, the factorization could not *> be completed, while if k = N, the factorization was *> completed, but D(N) <= 0. *> \endverbatim diff --git a/lapack-netlib/SRC/zrscl.c b/lapack-netlib/SRC/zrscl.c new file mode 100644 index 000000000..2264b5465 --- /dev/null +++ b/lapack-netlib/SRC/zrscl.c @@ -0,0 +1,735 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b ZDRSCL multiplies a vector by the reciprocal of a real scalar. */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download ZDRSCL + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE ZRSCL( N, A, X, INCX ) */ + +/* INTEGER INCX, N */ +/* COMPLEX*16 A */ +/* COMPLEX*16 X( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > ZRSCL multiplies an n-element complex vector x by the complex scalar */ +/* > 1/a. This is done without overflow or underflow as long as */ +/* > the final result x/a does not overflow or underflow. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of components of the vector x. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX*16 */ +/* > The scalar a which is used to divide each component of x. */ +/* > A must not be 0, or the subroutine will divide by zero. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is COMPLEX*16 array, dimension */ +/* > (1+(N-1)*abs(INCX)) */ +/* > The n-element vector x. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] INCX */ +/* > \verbatim */ +/* > INCX is INTEGER */ +/* > The increment between successive values of the vector SX. */ +/* > > 0: SX(1) = X(1) and SX(1+(i-1)*INCX) = x(i), 1< i<= n */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup complex16OTHERauxiliary */ + +/* ===================================================================== */ +/* Subroutine */ int zrscl_(integer *n, doublecomplex *a, doublecomplex *x, + integer *incx) +{ + /* System generated locals */ + doublereal d__1, d__2; + doublecomplex z__1; + + /* Local variables */ + doublereal absi, absr; + extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + doublecomplex *, integer *); + doublereal ai, ar; + extern doublereal dlamch_(char *); + doublereal ui, ov, ur, safmin, safmax; + extern /* Subroutine */ int zdscal_(integer *, doublereal *, + doublecomplex *, integer *), zdrscl_(integer *, doublereal *, + doublecomplex *, integer *); + + +/* -- LAPACK auxiliary routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Quick return if possible */ + + /* Parameter adjustments */ + --x; + + /* Function Body */ + if (*n <= 0) { + return 0; + } + +/* Get machine parameters */ + + safmin = dlamch_("S"); + safmax = 1. / safmin; + ov = dlamch_("O"); + +/* Initialize constants related to A. */ + + ar = a->r; + ai = d_imag(a); + absr = abs(ar); + absi = abs(ai); + + if (ai == 0.) { +/* If alpha is real, then we can use csrscl */ + zdrscl_(n, &ar, &x[1], incx); + + } else if (ar == 0.) { +/* If alpha has a zero real part, then we follow the same rules as if */ +/* alpha were real. */ + if (absi > safmax) { + zdscal_(n, &safmin, &x[1], incx); + d__1 = -safmax / ai; + z__1.r = 0., z__1.i = d__1; + zscal_(n, &z__1, &x[1], incx); + } else if (absi < safmin) { + d__1 = -safmin / ai; + z__1.r = 0., z__1.i = d__1; + zscal_(n, &z__1, &x[1], incx); + zdscal_(n, &safmax, &x[1], incx); + } else { + d__1 = -1. / ai; + z__1.r = 0., z__1.i = d__1; + zscal_(n, &z__1, &x[1], incx); + } + + } else { +/* The following numbers can be computed. */ +/* They are the inverse of the real and imaginary parts of 1/alpha. */ +/* Note that a and b are always different from zero. */ +/* NaNs are only possible if either: */ +/* 1. alphaR or alphaI is NaN. */ +/* 2. alphaR and alphaI are both infinite, in which case it makes sense */ +/* to propagate a NaN. */ + ur = ar + ai * (ai / ar); + ui = ai + ar * (ar / ai); + + if (abs(ur) < safmin || abs(ui) < safmin) { +/* This means that both alphaR and alphaI are very small. */ + d__1 = safmin / ur; + d__2 = -safmin / ui; + z__1.r = d__1, z__1.i = d__2; + zscal_(n, &z__1, &x[1], incx); + zdscal_(n, &safmax, &x[1], incx); + } else if (abs(ur) > safmax || abs(ui) > safmax) { + if (absr > ov || absi > ov) { +/* This means that a and b are both Inf. No need for scaling. */ + d__1 = 1. / ur; + d__2 = -1. / ui; + z__1.r = d__1, z__1.i = d__2; + zscal_(n, &z__1, &x[1], incx); + } else { + zdscal_(n, &safmin, &x[1], incx); + if (abs(ur) > ov || abs(ui) > ov) { +/* Infs were generated. We do proper scaling to avoid them. */ + if (absr >= absi) { +/* ABS( UR ) <= ABS( UI ) */ + ur = safmin * ar + safmin * (ai * (ai / ar)); + ui = safmin * ai + ar * (safmin * ar / ai); + } else { +/* ABS( UR ) > ABS( UI ) */ + ur = safmin * ar + ai * (safmin * ai / ar); + ui = safmin * ai + safmin * (ar * (ar / ai)); + } + d__1 = 1. / ur; + d__2 = -1. / ui; + z__1.r = d__1, z__1.i = d__2; + zscal_(n, &z__1, &x[1], incx); + } else { + d__1 = safmax / ur; + d__2 = -safmax / ui; + z__1.r = d__1, z__1.i = d__2; + zscal_(n, &z__1, &x[1], incx); + } + } + } else { + d__1 = 1. / ur; + d__2 = -1. / ui; + z__1.r = d__1, z__1.i = d__2; + zscal_(n, &z__1, &x[1], incx); + } + } + + return 0; + +/* End of ZRSCL */ + +} /* zrscl_ */ + diff --git a/lapack-netlib/SRC/zrscl.f b/lapack-netlib/SRC/zrscl.f new file mode 100644 index 000000000..970f6de75 --- /dev/null +++ b/lapack-netlib/SRC/zrscl.f @@ -0,0 +1,203 @@ +*> \brief \b ZDRSCL multiplies a vector by the reciprocal of a real scalar. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZDRSCL + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZRSCL( N, A, X, INCX ) +* +* .. Scalar Arguments .. +* INTEGER INCX, N +* COMPLEX*16 A +* .. +* .. Array Arguments .. +* COMPLEX*16 X( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZRSCL multiplies an n-element complex vector x by the complex scalar +*> 1/a. This is done without overflow or underflow as long as +*> the final result x/a does not overflow or underflow. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of components of the vector x. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX*16 +*> The scalar a which is used to divide each component of x. +*> A must not be 0, or the subroutine will divide by zero. +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is COMPLEX*16 array, dimension +*> (1+(N-1)*abs(INCX)) +*> The n-element vector x. +*> \endverbatim +*> +*> \param[in] INCX +*> \verbatim +*> INCX is INTEGER +*> The increment between successive values of the vector SX. +*> > 0: SX(1) = X(1) and SX(1+(i-1)*INCX) = x(i), 1< i<= n +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16OTHERauxiliary +* +* ===================================================================== + SUBROUTINE ZRSCL( N, A, X, INCX ) +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INCX, N + COMPLEX*16 A +* .. +* .. Array Arguments .. + COMPLEX*16 X( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + DOUBLE PRECISION SAFMAX, SAFMIN, OV, AR, AI, ABSR, ABSI, UR, UI +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH + COMPLEX*16 ZLADIV + EXTERNAL DLAMCH, ZLADIV +* .. +* .. External Subroutines .. + EXTERNAL DSCAL, ZDSCAL, ZDRSCL +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. +* .. Executable Statements .. +* +* Quick return if possible +* + IF( N.LE.0 ) + $ RETURN +* +* Get machine parameters +* + SAFMIN = DLAMCH( 'S' ) + SAFMAX = ONE / SAFMIN + OV = DLAMCH( 'O' ) +* +* Initialize constants related to A. +* + AR = DBLE( A ) + AI = DIMAG( A ) + ABSR = ABS( AR ) + ABSI = ABS( AI ) +* + IF( AI.EQ.ZERO ) THEN +* If alpha is real, then we can use csrscl + CALL ZDRSCL( N, AR, X, INCX ) +* + ELSE IF( AR.EQ.ZERO ) THEN +* If alpha has a zero real part, then we follow the same rules as if +* alpha were real. + IF( ABSI.GT.SAFMAX ) THEN + CALL ZDSCAL( N, SAFMIN, X, INCX ) + CALL ZSCAL( N, DCMPLX( ZERO, -SAFMAX / AI ), X, INCX ) + ELSE IF( ABSI.LT.SAFMIN ) THEN + CALL ZSCAL( N, DCMPLX( ZERO, -SAFMIN / AI ), X, INCX ) + CALL ZDSCAL( N, SAFMAX, X, INCX ) + ELSE + CALL ZSCAL( N, DCMPLX( ZERO, -ONE / AI ), X, INCX ) + END IF +* + ELSE +* The following numbers can be computed. +* They are the inverse of the real and imaginary parts of 1/alpha. +* Note that a and b are always different from zero. +* NaNs are only possible if either: +* 1. alphaR or alphaI is NaN. +* 2. alphaR and alphaI are both infinite, in which case it makes sense +* to propagate a NaN. + UR = AR + AI * ( AI / AR ) + UI = AI + AR * ( AR / AI ) +* + IF( (ABS( UR ).LT.SAFMIN).OR.(ABS( UI ).LT.SAFMIN) ) THEN +* This means that both alphaR and alphaI are very small. + CALL ZSCAL( N, DCMPLX( SAFMIN / UR, -SAFMIN / UI ), X, + $ INCX ) + CALL ZDSCAL( N, SAFMAX, X, INCX ) + ELSE IF( (ABS( UR ).GT.SAFMAX).OR.(ABS( UI ).GT.SAFMAX) ) THEN + IF( (ABSR.GT.OV).OR.(ABSI.GT.OV) ) THEN +* This means that a and b are both Inf. No need for scaling. + CALL ZSCAL( N, DCMPLX( ONE / UR, -ONE / UI ), X, INCX ) + ELSE + CALL ZDSCAL( N, SAFMIN, X, INCX ) + IF( (ABS( UR ).GT.OV).OR.(ABS( UI ).GT.OV) ) THEN +* Infs were generated. We do proper scaling to avoid them. + IF( ABSR.GE.ABSI ) THEN +* ABS( UR ) <= ABS( UI ) + UR = (SAFMIN * AR) + SAFMIN * (AI * ( AI / AR )) + UI = (SAFMIN * AI) + AR * ( (SAFMIN * AR) / AI ) + ELSE +* ABS( UR ) > ABS( UI ) + UR = (SAFMIN * AR) + AI * ( (SAFMIN * AI) / AR ) + UI = (SAFMIN * AI) + SAFMIN * (AR * ( AR / AI )) + END IF + CALL ZSCAL( N, DCMPLX( ONE / UR, -ONE / UI ), X, + $ INCX ) + ELSE + CALL ZSCAL( N, DCMPLX( SAFMAX / UR, -SAFMAX / UI ), + $ X, INCX ) + END IF + END IF + ELSE + CALL ZSCAL( N, DCMPLX( ONE / UR, -ONE / UI ), X, INCX ) + END IF + END IF +* + RETURN +* +* End of ZRSCL +* + END diff --git a/lapack-netlib/SRC/zstedc.f b/lapack-netlib/SRC/zstedc.f index 74d390af7..e62063a19 100644 --- a/lapack-netlib/SRC/zstedc.f +++ b/lapack-netlib/SRC/zstedc.f @@ -43,12 +43,6 @@ *> be found if ZHETRD or ZHPTRD or ZHBTRD has been used to reduce this *> matrix to tridiagonal form. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. See DLAED3 for details. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zstegr.f b/lapack-netlib/SRC/zstegr.f index ee43a0d21..3736a0517 100644 --- a/lapack-netlib/SRC/zstegr.f +++ b/lapack-netlib/SRC/zstegr.f @@ -56,7 +56,7 @@ *> *> Note : ZSTEGR and ZSTEMR work only on machines which follow *> IEEE-754 floating-point standard in their handling of infinities and -*> NaNs. Normal execution may create these exceptiona values and hence +*> NaNs. Normal execution may create these exceptional values and hence *> may abort due to a floating point exception in environments which *> do not conform to the IEEE-754 standard. *> \endverbatim diff --git a/lapack-netlib/SRC/zstemr.f b/lapack-netlib/SRC/zstemr.f index b034198de..4eaf5ef97 100644 --- a/lapack-netlib/SRC/zstemr.f +++ b/lapack-netlib/SRC/zstemr.f @@ -320,7 +320,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup stemr * *> \par Contributors: * ================== @@ -330,6 +330,7 @@ *> Inderjit Dhillon, University of Texas, Austin, USA \n *> Osni Marques, LBNL/NERSC, USA \n *> Christof Voemel, University of California, Berkeley, USA \n +*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n * * ===================================================================== SUBROUTINE ZSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, @@ -361,7 +362,8 @@ $ MINRGP = 1.0D-3 ) * .. * .. Local Scalars .. - LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY + LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY, + $ LAESWAP INTEGER I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW, $ IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD, $ INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP, @@ -397,6 +399,7 @@ * LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) ) ZQUERY = ( NZC.EQ.-1 ) + LAESWAP = .FALSE. * DSTEMR needs WORK of size 6*N, IWORK of size 3*N. * In addition, DLARRE needs WORK of size 6*N, IWORK of size 5*N. @@ -519,6 +522,15 @@ ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN CALL DLAEV2( D(1), E(1), D(2), R1, R2, CS, SN ) END IF +* D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However, +* the following code requires R1 >= R2. Hence, we correct +* the order of R1, R2, CS, SN if R1 < R2 before further processing. + IF( R1.LT.R2 ) THEN + E(2) = R1 + R1 = R2 + R2 = E(2) + LAESWAP = .TRUE. + ENDIF IF( ALLEIG.OR. $ (VALEIG.AND.(R2.GT.WL).AND. $ (R2.LE.WU)).OR. @@ -526,8 +538,13 @@ M = M+1 W( M ) = R2 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = -SN - Z( 2, M ) = CS + IF( LAESWAP ) THEN + Z( 1, M ) = CS + Z( 2, M ) = SN + ELSE + Z( 1, M ) = -SN + Z( 2, M ) = CS + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN @@ -550,8 +567,13 @@ M = M+1 W( M ) = R1 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = CS - Z( 2, M ) = SN + IF( LAESWAP ) THEN + Z( 1, M ) = -SN + Z( 2, M ) = CS + ELSE + Z( 1, M ) = CS + Z( 2, M ) = SN + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN diff --git a/lapack-netlib/SRC/zsyconvf.f b/lapack-netlib/SRC/zsyconvf.f index eb49b0f3d..0958a5f77 100644 --- a/lapack-netlib/SRC/zsyconvf.f +++ b/lapack-netlib/SRC/zsyconvf.f @@ -39,7 +39,7 @@ *> ZSYTRF provided on entry in parameter A into the factorization *> output format used in ZSYTRF_RK (or ZSYTRF_BK) that is stored *> on exit in parameters A and E. It also converts in place details of -*> the intechanges stored in IPIV from the format used in ZSYTRF into +*> the interchanges stored in IPIV from the format used in ZSYTRF into *> the format used in ZSYTRF_RK (or ZSYTRF_BK). *> *> If parameter WAY = 'R': @@ -48,7 +48,7 @@ *> (or ZSYTRF_BK) provided on entry in parameters A and E into *> the factorization output format used in ZSYTRF that is stored *> on exit in parameter A. It also converts in place details of -*> the intechanges stored in IPIV from the format used in ZSYTRF_RK +*> the interchanges stored in IPIV from the format used in ZSYTRF_RK *> (or ZSYTRF_BK) into the format used in ZSYTRF. *> *> ZSYCONVF can also convert in Hermitian matrix case, i.e. between @@ -325,7 +325,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -469,7 +469,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -535,7 +535,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/zsyconvf_rook.f b/lapack-netlib/SRC/zsyconvf_rook.f index 3cfa694c3..62cca060b 100644 --- a/lapack-netlib/SRC/zsyconvf_rook.f +++ b/lapack-netlib/SRC/zsyconvf_rook.f @@ -520,7 +520,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/zsysv_aa_2stage.f b/lapack-netlib/SRC/zsysv_aa_2stage.f index 4f19630a9..701d73a38 100644 --- a/lapack-netlib/SRC/zsysv_aa_2stage.f +++ b/lapack-netlib/SRC/zsysv_aa_2stage.f @@ -87,7 +87,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/zsytrf.f b/lapack-netlib/SRC/zsytrf.f index a775a8758..dc9016c69 100644 --- a/lapack-netlib/SRC/zsytrf.f +++ b/lapack-netlib/SRC/zsytrf.f @@ -232,7 +232,7 @@ * Determine the block size * NB = ILAENV( 1, 'ZSYTRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/zsytrf_aa.f b/lapack-netlib/SRC/zsytrf_aa.f index 73d257cfe..ddb19925a 100644 --- a/lapack-netlib/SRC/zsytrf_aa.f +++ b/lapack-netlib/SRC/zsytrf_aa.f @@ -74,7 +74,7 @@ *> *> On exit, the tridiagonal matrix is stored in the diagonals *> and the subdiagonals of A just below (or above) the diagonals, -*> and L is stored below (or above) the subdiaonals, when UPLO +*> and L is stored below (or above) the subdiagonals, when UPLO *> is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/zsytrf_aa_2stage.f b/lapack-netlib/SRC/zsytrf_aa_2stage.f index b731cb7a2..95b9fda0d 100644 --- a/lapack-netlib/SRC/zsytrf_aa_2stage.f +++ b/lapack-netlib/SRC/zsytrf_aa_2stage.f @@ -75,7 +75,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/zsytrf_rk.f b/lapack-netlib/SRC/zsytrf_rk.f index 3b398ce6c..af8b8d501 100644 --- a/lapack-netlib/SRC/zsytrf_rk.f +++ b/lapack-netlib/SRC/zsytrf_rk.f @@ -310,7 +310,7 @@ * Determine the block size * NB = ILAENV( 1, 'ZSYTRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/ztgevc.f b/lapack-netlib/SRC/ztgevc.f index 23bd36ddb..6dcbe582f 100644 --- a/lapack-netlib/SRC/ztgevc.f +++ b/lapack-netlib/SRC/ztgevc.f @@ -53,7 +53,7 @@ *> *> S*x = w*P*x, (y**H)*S = w*(y**H)*P, *> -*> where y**H denotes the conjugate tranpose of y. +*> where y**H denotes the conjugate transpose of y. *> The eigenvalues are not input to this routine, but are computed *> directly from the diagonal elements of S and P. *> @@ -154,7 +154,7 @@ *> \verbatim *> VR is COMPLEX*16 array, dimension (LDVR,MM) *> On entry, if SIDE = 'R' or 'B' and HOWMNY = 'B', VR must -*> contain an N-by-N matrix Q (usually the unitary matrix Z +*> contain an N-by-N matrix Z (usually the unitary matrix Z *> of right Schur vectors returned by ZHGEQZ). *> On exit, if SIDE = 'R' or 'B', VR contains: *> if HOWMNY = 'A', the matrix X of right eigenvectors of (S,P); @@ -259,7 +259,7 @@ EXTERNAL LSAME, DLAMCH, ZLADIV * .. * .. External Subroutines .. - EXTERNAL DLABAD, XERBLA, ZGEMV + EXTERNAL XERBLA, ZGEMV * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DCONJG, DIMAG, MAX, MIN @@ -367,7 +367,6 @@ * SAFMIN = DLAMCH( 'Safe minimum' ) BIG = ONE / SAFMIN - CALL DLABAD( SAFMIN, BIG ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) SMALL = SAFMIN*N / ULP BIG = ONE / SMALL diff --git a/lapack-netlib/SRC/ztgsy2.f b/lapack-netlib/SRC/ztgsy2.f index ee26b5e7b..0cae8939e 100644 --- a/lapack-netlib/SRC/ztgsy2.f +++ b/lapack-netlib/SRC/ztgsy2.f @@ -57,7 +57,7 @@ *> Z = [ kron(In, A) -kron(B**H, Im) ] (2) *> [ kron(In, D) -kron(E**H, Im) ], *> -*> Ik is the identity matrix of size k and X**H is the conjuguate transpose of X. +*> Ik is the identity matrix of size k and X**H is the conjugate transpose of X. *> kron(X, Y) is the Kronecker product between the matrices X and Y. *> *> If TRANS = 'C', y in the conjugate transposed system Z**H*y = scale*b diff --git a/lapack-netlib/SRC/ztrevc3.f b/lapack-netlib/SRC/ztrevc3.f index 6300e80ae..8fb144e0c 100644 --- a/lapack-netlib/SRC/ztrevc3.f +++ b/lapack-netlib/SRC/ztrevc3.f @@ -321,9 +321,9 @@ * INFO = 0 NB = ILAENV( 1, 'ZTREVC', SIDE // HOWMNY, N, -1, -1, -1 ) - MAXWRK = N + 2*N*NB + MAXWRK = MAX( 1, N + 2*N*NB ) WORK(1) = MAXWRK - RWORK(1) = N + RWORK(1) = MAX( 1, N ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/ztrexc.f b/lapack-netlib/SRC/ztrexc.f index b92e63efa..32aae51e4 100644 --- a/lapack-netlib/SRC/ztrexc.f +++ b/lapack-netlib/SRC/ztrexc.f @@ -40,7 +40,7 @@ *> *> The Schur form T is reordered by a unitary similarity transformation *> Z**H*T*Z, and optionally the matrix Q of Schur vectors is updated by -*> postmultplying it with Z. +*> postmultiplying it with Z. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zunbdb1.f b/lapack-netlib/SRC/zunbdb1.f index 767788a74..2fae170de 100644 --- a/lapack-netlib/SRC/zunbdb1.f +++ b/lapack-netlib/SRC/zunbdb1.f @@ -37,7 +37,7 @@ *>\verbatim *> *> ZUNBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/zunbdb2.f b/lapack-netlib/SRC/zunbdb2.f index 46b08aa1e..28e78fc23 100644 --- a/lapack-netlib/SRC/zunbdb2.f +++ b/lapack-netlib/SRC/zunbdb2.f @@ -37,7 +37,7 @@ *>\verbatim *> *> ZUNBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/zunbdb3.f b/lapack-netlib/SRC/zunbdb3.f index 74083e41a..9f32a7a88 100644 --- a/lapack-netlib/SRC/zunbdb3.f +++ b/lapack-netlib/SRC/zunbdb3.f @@ -37,7 +37,7 @@ *>\verbatim *> *> ZUNBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/zunbdb4.f b/lapack-netlib/SRC/zunbdb4.f index 4672cfa67..a1db5eb79 100644 --- a/lapack-netlib/SRC/zunbdb4.f +++ b/lapack-netlib/SRC/zunbdb4.f @@ -38,7 +38,7 @@ *>\verbatim *> *> ZUNBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/zunbdb5.f b/lapack-netlib/SRC/zunbdb5.f index 23174fe50..c451ae921 100644 --- a/lapack-netlib/SRC/zunbdb5.f +++ b/lapack-netlib/SRC/zunbdb5.f @@ -148,7 +148,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup unbdb5 * * ===================================================================== SUBROUTINE ZUNBDB5( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, @@ -169,18 +169,21 @@ * ===================================================================== * * .. Parameters .. + DOUBLE PRECISION REALZERO + PARAMETER ( REALZERO = 0.0D0 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = (1.0D0,0.0D0), ZERO = (0.0D0,0.0D0) ) * .. * .. Local Scalars .. INTEGER CHILDINFO, I, J + DOUBLE PRECISION EPS, NORM, SCL, SSQ * .. * .. External Subroutines .. - EXTERNAL ZUNBDB6, XERBLA + EXTERNAL ZLASSQ, ZUNBDB6, ZSCAL, XERBLA * .. * .. External Functions .. - DOUBLE PRECISION DZNRM2 - EXTERNAL DZNRM2 + DOUBLE PRECISION DLAMCH, DZNRM2 + EXTERNAL DLAMCH, DZNRM2 * .. * .. Intrinsic Function .. INTRINSIC MAX @@ -213,16 +216,33 @@ RETURN END IF * -* Project X onto the orthogonal complement of Q + EPS = DLAMCH( 'Precision' ) * - CALL ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, LDQ2, - $ WORK, LWORK, CHILDINFO ) +* Project X onto the orthogonal complement of Q if X is nonzero * -* If the projection is nonzero, then return + SCL = REALZERO + SSQ = REALZERO + CALL ZLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL ZLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM = SCL * SQRT( SSQ ) * - IF( DZNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. DZNRM2(M2,X2,INCX2) .NE. ZERO ) THEN - RETURN + IF( NORM .GT. N * EPS ) THEN +* Scale vector to unit norm to avoid problems in the caller code. +* Computing the reciprocal is undesirable but +* * xLASCL cannot be used because of the vector increments and +* * the round-off error has a negligible impact on +* orthogonalization. + CALL ZSCAL( M1, ONE / NORM, X1, INCX1 ) + CALL ZSCAL( M2, ONE / NORM, X2, INCX2 ) + CALL ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, + $ LDQ2, WORK, LWORK, CHILDINFO ) +* +* If the projection is nonzero, then return +* + IF( DZNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. DZNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN + RETURN + END IF END IF * * Project each standard basis vector e_1,...,e_M1 in turn, stopping @@ -238,8 +258,8 @@ END DO CALL ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, $ LDQ2, WORK, LWORK, CHILDINFO ) - IF( DZNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. DZNRM2(M2,X2,INCX2) .NE. ZERO ) THEN + IF( DZNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. DZNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN RETURN END IF END DO @@ -257,8 +277,8 @@ X2(I) = ONE CALL ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, $ LDQ2, WORK, LWORK, CHILDINFO ) - IF( DZNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. DZNRM2(M2,X2,INCX2) .NE. ZERO ) THEN + IF( DZNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. DZNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN RETURN END IF END DO diff --git a/lapack-netlib/SRC/zunbdb6.f b/lapack-netlib/SRC/zunbdb6.f index ed666e449..ddc9dfc61 100644 --- a/lapack-netlib/SRC/zunbdb6.f +++ b/lapack-netlib/SRC/zunbdb6.f @@ -41,9 +41,8 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The Euclidean norm of X must be one and the columns of Q must be -*> orthonormal. The orthogonalized vector will be zero if and only if it -*> lies entirely in the range of Q. +*> The columns of Q must be orthonormal. The orthogonalized vector will +*> be zero if and only if it lies entirely in the range of Q. *> *> The projection is computed with at most two iterations of the *> classical Gram-Schmidt algorithm, see @@ -152,7 +151,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup unbdb6 * * ===================================================================== SUBROUTINE ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, @@ -174,7 +173,7 @@ * * .. Parameters .. DOUBLE PRECISION ALPHA, REALONE, REALZERO - PARAMETER ( ALPHA = 0.01D0, REALONE = 1.0D0, + PARAMETER ( ALPHA = 0.83D0, REALONE = 1.0D0, $ REALZERO = 0.0D0 ) COMPLEX*16 NEGONE, ONE, ZERO PARAMETER ( NEGONE = (-1.0D0,0.0D0), ONE = (1.0D0,0.0D0), @@ -223,14 +222,16 @@ * EPS = DLAMCH( 'Precision' ) * +* Compute the Euclidean norm of X +* + SCL = REALZERO + SSQ = REALZERO + CALL ZLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL ZLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM = SCL * SQRT( SSQ ) +* * First, project X onto the orthogonal complement of Q's column * space -* -* Christoph Conrads: In debugging mode the norm should be computed -* and an assertion added comparing the norm with one. Alas, Fortran -* never made it into 1989 when assert() was introduced into the C -* programming language. - NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N diff --git a/lapack-netlib/SRC/zuncsd2by1.f b/lapack-netlib/SRC/zuncsd2by1.f index 2d861c2cc..399b598be 100644 --- a/lapack-netlib/SRC/zuncsd2by1.f +++ b/lapack-netlib/SRC/zuncsd2by1.f @@ -211,13 +211,13 @@ *> LRWORK is INTEGER *> The dimension of the array RWORK. *> -*> If LRWORK = -1, then a workspace query is assumed; the routine +*> If LRWORK=-1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK and RWORK *> arrays, returns this value as the first entry of the WORK *> and RWORK array, respectively, and no error message related *> to LWORK or LRWORK is issued by XERBLA. *> \endverbatim -* +*> *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (M-MIN(P,M-P,Q,M-Q)) diff --git a/lapack-netlib/SRC/zungtsqr.f b/lapack-netlib/SRC/zungtsqr.f index 4f2d7dfdd..23e28ac5c 100644 --- a/lapack-netlib/SRC/zungtsqr.f +++ b/lapack-netlib/SRC/zungtsqr.f @@ -133,6 +133,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= (M+NB)*N. *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK @@ -302,4 +303,4 @@ * * End of ZUNGTSQR * - END \ No newline at end of file + END diff --git a/lapack-netlib/TESTING/CMakeLists.txt b/lapack-netlib/TESTING/CMakeLists.txt index b4e2223f7..d4e6f970d 100644 --- a/lapack-netlib/TESTING/CMakeLists.txt +++ b/lapack-netlib/TESTING/CMakeLists.txt @@ -54,6 +54,9 @@ add_lapack_test(sgqr.out gqr.in xeigtsts) add_lapack_test(sgsv.out gsv.in xeigtsts) add_lapack_test(scsd.out csd.in xeigtsts) add_lapack_test(slse.out lse.in xeigtsts) +# +# ======== SINGLE DMD EIG TESTS =========================== +add_lapack_test(sdmd.out sdmd.in xdmdeigtsts) endif() if(BUILD_DOUBLE) @@ -85,6 +88,9 @@ add_lapack_test(dgqr.out gqr.in xeigtstd) add_lapack_test(dgsv.out gsv.in xeigtstd) add_lapack_test(dcsd.out csd.in xeigtstd) add_lapack_test(dlse.out lse.in xeigtstd) +# +# ======== DOUBLE DMD EIG TESTS =========================== +add_lapack_test(ddmd.out ddmd.in xdmdeigtstd) endif() if(BUILD_COMPLEX) @@ -114,6 +120,9 @@ add_lapack_test(cgqr.out gqr.in xeigtstc) add_lapack_test(cgsv.out gsv.in xeigtstc) add_lapack_test(ccsd.out csd.in xeigtstc) add_lapack_test(clse.out lse.in xeigtstc) +# +# ======== COMPLEX DMD EIG TESTS =========================== +add_lapack_test(cdmd.out cdmd.in xdmdeigtstc) endif() if(BUILD_COMPLEX16) @@ -145,6 +154,9 @@ add_lapack_test(zgqr.out gqr.in xeigtstz) add_lapack_test(zgsv.out gsv.in xeigtstz) add_lapack_test(zcsd.out csd.in xeigtstz) add_lapack_test(zlse.out lse.in xeigtstz) +# +# ======== COMPLEX16 DMD EIG TESTS =========================== +add_lapack_test(zdmd.out zdmd.in xdmdeigtstz) endif() diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index d252c7fa9..e4c4181b2 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -42,6 +42,8 @@ set(SEIGTST schkee.F sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f sstt22.f ssyl01.f ssyt21.f ssyt22.f) +set(SDMDEIGTST schkdmd.f90) + set(CEIGTST cchkee.F cbdt01.f cbdt02.f cbdt03.f cbdt05.f cchkbb.f cchkbd.f cchkbk.f cchkbl.f cchkec.f @@ -59,6 +61,8 @@ set(CEIGTST cchkee.F csgt01.f cslect.f csyl01.f cstt21.f cstt22.f cunt01.f cunt03.f) +set(CDMDEIGTST cchkdmd.f90) + set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f dsvdch.f dsvdct.f dsxt1.f) @@ -79,6 +83,8 @@ set(DEIGTST dchkee.F dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f dstt22.f dsyl01.f dsyt21.f dsyt22.f) +set(DDMDEIGTST dchkdmd.f90) + set(ZEIGTST zchkee.F zbdt01.f zbdt02.f zbdt03.f zbdt05.f zchkbb.f zchkbd.f zchkbk.f zchkbl.f zchkec.f @@ -96,24 +102,36 @@ set(ZEIGTST zchkee.F zsgt01.f zslect.f zsyl01.f zstt21.f zstt22.f zunt01.f zunt03.f) +set(ZDMDEIGTST zchkdmd.f90) + macro(add_eig_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) + target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) + + if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) + string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") + target_link_libraries(${name} omp pthread) + endif() + #${TMGLIB} ../${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() if(BUILD_SINGLE) add_eig_executable(xeigtsts ${SEIGTST} ${SCIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtsts ${SDMDEIGTST}) endif() if(BUILD_COMPLEX) add_eig_executable(xeigtstc ${CEIGTST} ${SCIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtstc ${CDMDEIGTST}) endif() if(BUILD_DOUBLE) add_eig_executable(xeigtstd ${DEIGTST} ${DZIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtstd ${DDMDEIGTST}) endif() if(BUILD_COMPLEX16) add_eig_executable(xeigtstz ${ZEIGTST} ${DZIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtstz ${ZDMDEIGTST}) endif() diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index 942ae6982..9cf0fc95e 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -64,6 +64,8 @@ SEIGTST = schkee.o \ sort03.o ssbt21.o ssgt01.o sslect.o sspt21.o sstt21.o \ sstt22.o ssyl01.o ssyt21.o ssyt22.o +SDMDEIGTST = schkdmd.o + CEIGTST = cchkee.o \ cbdt01.o cbdt02.o cbdt03.o cbdt05.o \ cchkbb.o cchkbd.o cchkbk.o cchkbl.o cchkec.o \ @@ -81,6 +83,8 @@ CEIGTST = cchkee.o \ csgt01.o cslect.o csyl01.o\ cstt21.o cstt22.o cunt01.o cunt03.o +CDMDEIGTST = cchkdmd.o + DZIGTST = dlafts.o dlahd2.o dlasum.o dlatb9.o dstech.o dstect.o \ dsvdch.o dsvdct.o dsxt1.o @@ -101,6 +105,8 @@ DEIGTST = dchkee.o \ dort03.o dsbt21.o dsgt01.o dslect.o dspt21.o dstt21.o \ dstt22.o dsyl01.o dsyt21.o dsyt22.o +DDMDEIGTST = dchkdmd.o + ZEIGTST = zchkee.o \ zbdt01.o zbdt02.o zbdt03.o zbdt05.o \ zchkbb.o zchkbd.o zchkbk.o zchkbl.o zchkec.o \ @@ -118,14 +124,28 @@ ZEIGTST = zchkee.o \ zsgt01.o zslect.o zsyl01.o\ zstt21.o zstt22.o zunt01.o zunt03.o +ZDMDEIGTST = zchkdmd.o + .PHONY: all all: single complex double complex16 .PHONY: single complex double complex16 -single: xeigtsts -complex: xeigtstc -double: xeigtstd -complex16: xeigtstz +single: xeigtsts xdmdeigtsts +complex: xeigtstc xdmdeigtstc +double: xeigtstd xdmdeigtstd +complex16: xeigtstz xdmdeigtstz + +xdmdeigtsts: $(SDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ + +xdmdeigtstc: $(CDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ + +xdmdeigtstd: $(DDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ + +xdmdeigtstz: $(ZDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ @@ -139,6 +159,10 @@ xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +$(SDMDEIGTST): $(FRC) +$(CDMDEIGTST): $(FRC) +$(DDMDEIGTST): $(FRC) +$(ZDMDEIGTST): $(FRC) $(AEIGTST): $(FRC) $(SCIGTST): $(FRC) $(DZIGTST): $(FRC) @@ -155,7 +179,7 @@ clean: cleanobj cleanexe cleanobj: rm -f *.o cleanexe: - rm -f xeigtst* + rm -f xeigtst* xdmdeigtst* schkee.o: schkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< @@ -165,3 +189,11 @@ cchkee.o: cchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< zchkee.o: zchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< +schkdmd.o: schkdmd.f90 + $(FC) $(FFLAGS_DRV) -c -o $@ $< +cchkdmd.o: cchkdmd.f90 + $(FC) $(FFLAGS_DRV) -c -o $@ $< +dchkdmd.o: dchkdmd.f90 + $(FC) $(FFLAGS_DRV) -c -o $@ $< +zchkdmd.o: zchkdmd.f90 + $(FC) $(FFLAGS_DRV) -c -o $@ $< diff --git a/lapack-netlib/TESTING/EIG/alareq.f b/lapack-netlib/TESTING/EIG/alareq.f index 2585a686a..2cbe6db38 100644 --- a/lapack-netlib/TESTING/EIG/alareq.f +++ b/lapack-netlib/TESTING/EIG/alareq.f @@ -28,12 +28,12 @@ *> to evaluate the input line which requested NMATS matrix types for *> PATH. The flow of control is as follows: *> -*> If NMATS = NTYPES then +*> IF NMATS = NTYPES THEN *> DOTYPE(1:NTYPES) = .TRUE. -*> else +*> ELSE *> Read the next input line for NMATS matrix types *> Set DOTYPE(I) = .TRUE. for each valid type I -*> endif +*> END IF *> \endverbatim * * Arguments: diff --git a/lapack-netlib/TESTING/EIG/alarqg.f b/lapack-netlib/TESTING/EIG/alarqg.f index 6e2e6e7ec..b9fb88c65 100644 --- a/lapack-netlib/TESTING/EIG/alarqg.f +++ b/lapack-netlib/TESTING/EIG/alarqg.f @@ -28,12 +28,12 @@ *> to evaluate the input line which requested NMATS matrix types for *> PATH. The flow of control is as follows: *> -*> If NMATS = NTYPES then +*> IF NMATS = NTYPES THEN *> DOTYPE(1:NTYPES) = .TRUE. -*> else +*> ELSE *> Read the next input line for NMATS matrix types *> Set DOTYPE(I) = .TRUE. for each valid type I -*> endif +*> END IF *> \endverbatim * * Arguments: diff --git a/lapack-netlib/TESTING/EIG/cchkdmd.f90 b/lapack-netlib/TESTING/EIG/cchkdmd.f90 new file mode 100644 index 000000000..a9c181da9 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/cchkdmd.f90 @@ -0,0 +1,721 @@ +! This is a test program for checking the implementations of +! the implementations of the following subroutines +! +! CGEDMD, for computation of the +! Dynamic Mode Decomposition (DMD) +! CGEDMDQ, for computation of a +! QR factorization based compressed DMD +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! ======================================================== +! How to run the code (compiler, link info) +! ======================================================== +! Compile as FORTRAN 90 (or later) and link with BLAS and +! LAPACK libraries. +! NOTE: The code is developed and tested on top of the +! Intel MKL library (versions 2022.0.3 and 2022.2.0), +! using the Intel Fortran compiler. +! +! For developers of the C++ implementation +! ======================================================== +! See the LAPACK++ and Template Numerical Toolkit (TNT) +! +! Note on a development of the GPU HP implementation +! ======================================================== +! Work in progress. See CUDA, MAGMA, SLATE. +! NOTE: The four SVD subroutines used in this code are +! included as a part of R&D and for the completeness. +! This was also an opportunity to test those SVD codes. +! If the scaling option is used all four are essentially +! equally good. For implementations on HP platforms, +! one can use whichever SVD is available. +!............................................................ + +!............................................................ +!............................................................ +! + PROGRAM DMD_TEST + + use iso_fortran_env + IMPLICIT NONE + integer, parameter :: WP = real32 +!............................................................ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP + + COMPLEX(KIND=WP), PARAMETER :: CONE = ( 1.0_WP, 0.0_WP ) + COMPLEX(KIND=WP), PARAMETER :: CZERO = ( 0.0_WP, 0.0_WP ) +!............................................................ + REAL(KIND=WP), ALLOCATABLE, DIMENSION(:) :: RES, & + RES1, RESEX, SINGVX, SINGVQX, WORK + INTEGER , ALLOCATABLE, DIMENSION(:) :: IWORK + REAL(KIND=WP) :: WDUMMY(2) + INTEGER :: IDUMMY(4), ISEED(4) + REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, EPS, & + TOL, TOL2, SVDIFF, TMP, TMP_AU, & + TMP_FQR, TMP_REZ, TMP_REZQ, TMP_XW, & + TMP_EX +!............................................................ + COMPLEX(KIND=WP) :: CMAX + INTEGER :: LCWORK + COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:,:) :: A, AC, & + AU, F, F0, F1, S, W, & + X, X0, Y, Y0, Y1, Z, Z1 + COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:) :: CDA, CDR, & + CDL, CEIGS, CEIGSA, CWORK + COMPLEX(KIND=WP) :: CDUMMY(22), CDUM2X2(2,2) +!............................................................ + INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY, & + LDZ, LIWORK, LWORK, M, N, LLOOP, NRNK + INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, j, & + NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ, & + NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL, & + NFAIL_Z_XV, MODE, MODEL, MODER, WHTSVD + INTEGER :: iNRNK, iWHTSVD, K_traj, LWMINOPT + CHARACTER :: GRADE, JOBREF, JOBZ, PIVTNG, RSIGN, & + SCALE, RESIDS, WANTQ, WANTR + LOGICAL :: TEST_QRDMD + +!..... external subroutines (BLAS and LAPACK) + EXTERNAL CAXPY, CGEEV, CGEMM, CGEMV, CLASCL +!.....external subroutines DMD package +! subroutines under test + EXTERNAL CGEDMD, CGEDMDQ +!..... external functions (BLAS and LAPACK) + EXTERNAL SCNRM2, SLAMCH + REAL(KIND=WP) :: SCNRM2, SLAMCH + EXTERNAL CLANGE + REAL(KIND=WP) :: CLANGE + EXTERNAL ICAMAX + INTEGER ICAMAX + EXTERNAL LSAME + LOGICAL LSAME + + INTRINSIC ABS, INT, MIN, MAX, SIGN +!............................................................ + + + WRITE(*,*) 'COMPLEX CODE TESTING' + + ! The test is always in pairs : ( CGEDMD and CGEDMDQ) + ! because the test includes comparing the results (in pairs). +!..................................................................................... + ! This code by default performs tests on CGEDMDQ + ! Since the QR factorizations based algorithm is designed for + ! single trajectory data, only single trajectory tests will + ! be performed with xGEDMDQ. + + WANTQ = 'Q' + WANTR = 'R' +!................................................................................. + + EPS = SLAMCH( 'P' ) ! machine precision WP + + ! Global counters of failures of some particular tests + NFAIL = 0 + NFAIL_REZ = 0 + NFAIL_REZQ = 0 + NFAIL_Z_XV = 0 + NFAIL_F_QR = 0 + NFAIL_AU = 0 + NFAIL_SVDIFF = 0 + NFAIL_TOTAL = 0 + NFAILQ_TOTAL = 0 + + DO LLOOP = 1, 4 + + WRITE(*,*) 'L Loop Index = ', LLOOP + + ! Set the dimensions of the problem ... + READ(*,*) M + WRITE(*,*) 'M = ', M + ! ... and the number of snapshots. + READ(*,*) N + WRITE(*,*) 'N = ', N + + ! Test the dimensions + IF ( ( MIN(M,N) == 0 ) .OR. ( M < N ) ) THEN + WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.' + STOP + END IF +!............. + ! The seed inside the LLOOP so that each pass can be reproduced easily. + ISEED(1) = 4 + ISEED(2) = 3 + ISEED(3) = 2 + ISEED(4) = 1 + + LDA = M + LDF = M + LDX = M + LDY = M + LDW = N + LDZ = M + LDAU = M + LDS = N + + TMP_XW = ZERO + TMP_AU = ZERO + TMP_REZ = ZERO + TMP_REZQ = ZERO + SVDIFF = ZERO + TMP_EX = ZERO + + ALLOCATE( A(LDA,M) ) + ALLOCATE( AC(LDA,M) ) + ALLOCATE( F(LDF,N+1) ) + ALLOCATE( F0(LDF,N+1) ) + ALLOCATE( F1(LDF,N+1) ) + ALLOCATE( X(LDX,N) ) + ALLOCATE( X0(LDX,N) ) + ALLOCATE( Y(LDY,N+1) ) + ALLOCATE( Y0(LDY,N+1) ) + ALLOCATE( Y1(LDY,N+1) ) + ALLOCATE( AU(LDAU,N) ) + ALLOCATE( W(LDW,N) ) + ALLOCATE( S(LDS,N) ) + ALLOCATE( Z(LDZ,N) ) + ALLOCATE( Z1(LDZ,N) ) + ALLOCATE( RES(N) ) + ALLOCATE( RES1(N) ) + ALLOCATE( RESEX(N) ) + ALLOCATE( CEIGS(N) ) + ALLOCATE( SINGVX(N) ) + ALLOCATE( SINGVQX(N) ) + + TOL = 10*M*EPS + TOL2 = 10*M*N*EPS + +!............. + + DO K_traj = 1, 2 + ! Number of intial conditions in the simulation/trajectories (1 or 2) + + COND = 1.0D4 + CMAX = (1.0D1,1.0D1) + RSIGN = 'F' + GRADE = 'N' + MODEL = 6 + CONDL = 1.0D1 + MODER = 6 + CONDR = 1.0D1 + PIVTNG = 'N' + ! Loop over all parameter MODE values for CLATMR (+-1,..,+-6) + + DO MODE = 1, 6 + + ALLOCATE( IWORK(2*M) ) + ALLOCATE( CDA(M) ) + ALLOCATE( CDL(M) ) + ALLOCATE( CDR(M) ) + + CALL CLATMR( M, M, 'N', ISEED, 'N', CDA, MODE, COND, & + CMAX, RSIGN, GRADE, CDL, MODEL, CONDL, & + CDR, MODER, CONDR, PIVTNG, IWORK, M, M, & + ZERO, -ONE, 'N', A, LDA, IWORK(M+1), INFO ) + DEALLOCATE( CDR ) + DEALLOCATE( CDL ) + DEALLOCATE( CDA ) + DEALLOCATE( IWORK ) + + LCWORK = MAX(1,2*M) + ALLOCATE( CEIGSA(M) ) + ALLOCATE( CWORK(LCWORK) ) + ALLOCATE( WORK(2*M) ) + AC(1:M,1:M) = A(1:M,1:M) + CALL CGEEV( 'N','N', M, AC, LDA, CEIGSA, CDUM2X2, 2, & + CDUM2X2, 2, CWORK, LCWORK, WORK, INFO ) ! LAPACK CALL + DEALLOCATE(WORK) + DEALLOCATE(CWORK) + + TMP = ABS(CEIGSA(ICAMAX(M, CEIGSA, 1))) ! The spectral radius of A + ! Scale the matrix A to have unit spectral radius. + CALL CLASCL( 'G',0, 0, TMP, ONE, M, M, & + A, LDA, INFO ) + CALL CLASCL( 'G',0, 0, TMP, ONE, M, 1, & + CEIGSA, M, INFO ) + ANORM = CLANGE( 'F', M, M, A, LDA, WDUMMY ) + + IF ( K_traj == 2 ) THEN + ! generate data as two trajectories + ! with two inital conditions + CALL CLARNV(2, ISEED, M, F(1,1) ) + DO i = 1, N/2 + CALL CGEMV( 'N', M, M, CONE, A, LDA, F(1,i), 1, & + CZERO, F(1,i+1), 1 ) + END DO + X0(1:M,1:N/2) = F(1:M,1:N/2) + Y0(1:M,1:N/2) = F(1:M,2:N/2+1) + + CALL CLARNV(2, ISEED, M, F(1,1) ) + DO i = 1, N-N/2 + CALL CGEMV( 'N', M, M, CONE, A, LDA, F(1,i), 1, & + CZERO, F(1,i+1), 1 ) + END DO + X0(1:M,N/2+1:N) = F(1:M,1:N-N/2) + Y0(1:M,N/2+1:N) = F(1:M,2:N-N/2+1) + ELSE + CALL CLARNV(2, ISEED, M, F(1,1) ) + DO i = 1, N + CALL CGEMV( 'N', M, M, CONE, A, M, F(1,i), 1, & + CZERO, F(1,i+1), 1 ) + END DO + F0(1:M,1:N+1) = F(1:M,1:N+1) + X0(1:M,1:N) = F0(1:M,1:N) + Y0(1:M,1:N) = F0(1:M,2:N+1) + END IF + + DEALLOCATE( CEIGSA ) +!........................................................................ + + DO iJOBZ = 1, 4 + + SELECT CASE ( iJOBZ ) + CASE(1) + JOBZ = 'V' + RESIDS = 'R' + CASE(2) + JOBZ = 'V' + RESIDS = 'N' + CASE(3) + JOBZ = 'F' + RESIDS = 'N' + CASE(4) + JOBZ = 'N' + RESIDS = 'N' + END SELECT + + DO iJOBREF = 1, 3 + + SELECT CASE ( iJOBREF ) + CASE(1) + JOBREF = 'R' + CASE(2) + JOBREF = 'E' + CASE(3) + JOBREF = 'N' + END SELECT + + DO iSCALE = 1, 4 + + SELECT CASE ( iSCALE ) + CASE(1) + SCALE = 'S' + CASE(2) + SCALE = 'C' + CASE(3) + SCALE = 'Y' + CASE(4) + SCALE = 'N' + END SELECT + + DO iNRNK = -1, -2, -1 + NRNK = iNRNK + + DO iWHTSVD = 1, 3 + ! Check all four options to compute the POD basis + ! via the SVD. + WHTSVD = iWHTSVD + + DO LWMINOPT = 1, 2 + ! Workspace query for the minimal (1) and for the optimal + ! (2) workspace lengths determined by workspace query. + + ! CGEDMD is always tested and its results are also used for + ! comparisons with CGEDMDQ. + + X(1:M,1:N) = X0(1:M,1:N) + Y(1:M,1:N) = Y0(1:M,1:N) + + CALL CGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, CEIGS, Z, LDZ, RES, & + AU, LDAU, W, LDW, S, LDS, & + CDUMMY, -1, WDUMMY, -1, IDUMMY, -1, INFO ) + + IF ( (INFO .EQ. 2) .OR. ( INFO .EQ. 3 ) & + .OR. ( INFO < 0 ) ) THEN + WRITE(*,*) 'Call to CGEDMD workspace query failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ', & + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL, LDZ, LDAU, LDW, LDS + STOP + ELSE + !WRITE(*,*) '... done. Workspace length computed.' + END IF + + LCWORK = INT(CDUMMY(LWMINOPT)) + ALLOCATE(CWORK(LCWORK)) + LIWORK = IDUMMY(1) + ALLOCATE(IWORK(LIWORK)) + LWORK = INT(WDUMMY(1)) + ALLOCATE(WORK(LWORK)) + + CALL CGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, CEIGS, Z, LDZ, RES, & + AU, LDAU, W, LDW, S, LDS, & + CWORK, LCWORK, WORK, LWORK, IWORK, LIWORK, INFO ) + IF ( INFO /= 0 ) THEN + WRITE(*,*) 'Call to CGEDMD failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + STOP + END IF + SINGVX(1:N) = WORK(1:N) + + !...... CGEDMD check point + IF ( LSAME(JOBZ,'V') ) THEN + ! Check that Z = X*W, on return from CGEDMD + ! This checks that the returned eigenvectors in Z are + ! the product of the SVD'POD basis returned in X + ! and the eigenvectors of the Rayleigh quotient + ! returned in W + CALL CGEMM( 'N', 'N', M, K, K, CONE, X, LDX, W, LDW, & + CZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL CAXPY( M, -CONE, Z(1,i), 1, Z1(1,i), 1) + TMP = MAX(TMP, SCNRM2( M, Z1(1,i), 1 ) ) + END DO + TMP_XW = MAX(TMP_XW, TMP ) + IF ( TMP_XW <= TOL ) THEN + !WRITE(*,*) ' :) .... OK .........CGEDMD PASSED.' + ELSE + NFAIL_Z_XV = NFAIL_Z_XV + 1 + WRITE(*,*) ':( .................CGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + END IF + !...... CGEDMD check point + + IF ( LSAME(JOBREF,'R') ) THEN + ! The matrix A*U is returned for computing refined Ritz vectors. + ! Check that A*U is computed correctly using the formula + ! A*U = Y * V * inv(SIGMA). This depends on the + ! accuracy in the computed singular values and vectors of X. + ! See the paper for an error analysis. + ! Note that the left singular vectors of the input matrix X + ! are returned in the array X. + CALL CGEMM( 'N', 'N', M, K, M, CONE, A, LDA, X, LDX, & + CZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL CAXPY( M, -CONE, AU(1,i), 1, Z1(1,i), 1) + TMP = MAX( TMP, SCNRM2( M, Z1(1,i),1 ) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_AU = MAX( TMP_AU, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) ':) .... OK .........CGEDMD PASSED.' + ELSE + NFAIL_AU = NFAIL_AU + 1 + WRITE(*,*) ':( .................CGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL2 + END IF + ELSEIF ( LSAME(JOBREF,'E') ) THEN + ! The unscaled vectors of the Exact DMD are computed. + ! This option is included for the sake of completeness, + ! for users who prefer the Exact DMD vectors. The + ! returned vectors are in the real form, in the same way + ! as the Ritz vectors. Here we just save the vectors + ! and test them separately using a Matlab script. + CALL CGEMM( 'N', 'N', M, K, M, CONE, A, LDA, AU, LDAU, CZERO, Y1, LDY ) + + DO i=1, K + CALL CAXPY( M, -CEIGS(i), AU(1,i), 1, Y1(1,i), 1 ) + RESEX(i) = SCNRM2( M, Y1(1,i), 1) / SCNRM2(M,AU(1,i),1) + END DO + END IF + !...... CGEDMD check point + + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by CGEDMD with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL CGEMM( 'N', 'N', M, K, M, CONE, A, LDA, Z, LDZ, CZERO, Y1, LDY ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in CGEDMD,) + + DO i=1, K + ! have a real eigenvalue with real eigenvector + CALL CAXPY( M, -CEIGS(i), Z(1,i), 1, Y1(1,i), 1 ) + RES1(i) = SCNRM2( M, Y1(1,i), 1) + END DO + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_REZ = MAX( TMP_REZ, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) ':) .... OK ..........CGEDMD PASSED.' + ELSE + NFAIL_REZ = NFAIL_REZ + 1 + WRITE(*,*) ':( ..................CGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + + + IF ( LSAME(JOBREF,'E') ) THEN + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) ) + END DO + TMP_EX = MAX(TMP_EX,TMP) + END IF + + END IF + + DEALLOCATE(CWORK) + DEALLOCATE(WORK) + DEALLOCATE(IWORK) + +!....................................................................................................... + + IF ( K_traj == 1 ) THEN + + F(1:M,1:N+1) = F0(1:M,1:N+1) + CALL CGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, & + WHTSVD, M, N+1, F, LDF, X, LDX, Y, LDY, & + NRNK, TOL, K, CEIGS, Z, LDZ, RES, AU, & + LDAU, W, LDW, S, LDS, CDUMMY, -1, & + WDUMMY, -1, IDUMMY, -1, INFO ) + + LCWORK = INT(CDUMMY(LWMINOPT)) + ALLOCATE(CWORK(LCWORK)) + LIWORK = IDUMMY(1) + ALLOCATE(IWORK(LIWORK)) + LWORK = INT(WDUMMY(1)) + ALLOCATE(WORK(LWORK)) + + CALL CGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, & + WHTSVD, M, N+1, F, LDF, X, LDX, Y, LDY, & + NRNK, TOL, KQ, CEIGS, Z, LDZ, RES, AU, & + LDAU, W, LDW, S, LDS, CWORK, LCWORK, & + WORK, LWORK, IWORK, LIWORK, INFO ) + IF ( INFO /= 0 ) THEN + WRITE(*,*) 'Call to CGEDMDQ failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, WANTQ, WANTR, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + STOP + END IF + SINGVQX(1:N) =WORK(1:N) + + !..... ZGEDMDQ check point + + TMP = ZERO + DO i = 1, MIN(K, KQ) + TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / & + SINGVX(1) ) + END DO + SVDIFF = MAX( SVDIFF, TMP ) + IF ( TMP > TOL2 ) THEN + WRITE(*,*) 'FAILED! Something was wrong with the run.' + NFAIL_SVDIFF = NFAIL_SVDIFF + 1 + END IF + !..... CGEDMDQ check point + + !..... CGEDMDQ check point + IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN + ! Check that the QR factors are computed and returned + ! as requested. The residual ||F-Q*R||_F / ||F||_F + ! is compared to M*N*EPS. + F1(1:M,1:N+1) = F0(1:M,1:N+1) + CALL CGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -CONE, F, & + LDF, Y, LDY, CONE, F1, LDF ) + TMP_FQR = CLANGE( 'F', M, N+1, F1, LDF, WORK ) / & + CLANGE( 'F', M, N+1, F0, LDF, WORK ) + IF ( TMP_FQR <= TOL2 ) THEN + !WRITE(*,*) ':) CGEDMDQ ........ PASSED.' + ELSE + WRITE(*,*) ':( CGEDMDQ ........ FAILED.' + NFAIL_F_QR = NFAIL_F_QR + 1 + END IF + END IF + !..... ZGEDMDQ checkpoint + !..... ZGEDMDQ checkpoint + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by ZGEDMDQ with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL CGEMM( 'N', 'N', M, KQ, M, CONE, A, LDA, Z, LDZ, CZERO, Y1, LDY ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in ZGEDMDQ) + DO i = 1, KQ + ! have a real eigenvalue with real eigenvector + CALL CAXPY( M, -CEIGS(i), Z(1,i), 1, Y1(1,i), 1 ) + ! Y(1:M,i) = Y(1:M,i) - REIG(i)*Z(1:M,i) + RES1(i) = SCNRM2( M, Y1(1,i), 1) + END DO + TMP = ZERO + DO i = 1, KQ + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVQX(KQ)/(ANORM*SINGVQX(1)) ) + END DO + TMP_REZQ = MAX( TMP_REZQ, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) '.... OK ........ CGEDMDQ PASSED.' + ELSE + NFAIL_REZQ = NFAIL_REZQ + 1 + WRITE(*,*) '................ CGEDMDQ FAILED!', & + 'Check the code for implementation errors.' + END IF + END IF + + DEALLOCATE(CWORK) + DEALLOCATE(WORK) + DEALLOCATE(IWORK) + + END IF + + END DO ! LWMINOPT + !write(*,*) 'LWMINOPT loop completed' + END DO ! iWHTSVD + !write(*,*) 'WHTSVD loop completed' + END DO ! iNRNK -2:-1 + !write(*,*) 'NRNK loop completed' + END DO ! iSCALE 1:4 + !write(*,*) 'SCALE loop completed' + END DO + !write(*,*) 'JOBREF loop completed' + END DO ! iJOBZ + !write(*,*) 'JOBZ loop completed' + + END DO ! MODE -6:6 + !write(*,*) 'MODE loop completed' + END DO ! 1 or 2 trajectories + !write(*,*) 'trajectories loop completed' + + DEALLOCATE( A ) + DEALLOCATE( AC ) + DEALLOCATE( Z ) + DEALLOCATE( F ) + DEALLOCATE( F0 ) + DEALLOCATE( F1 ) + DEALLOCATE( X ) + DEALLOCATE( X0 ) + DEALLOCATE( Y ) + DEALLOCATE( Y0 ) + DEALLOCATE( Y1 ) + DEALLOCATE( AU ) + DEALLOCATE( W ) + DEALLOCATE( S ) + DEALLOCATE( Z1 ) + DEALLOCATE( RES ) + DEALLOCATE( RES1 ) + DEALLOCATE( RESEX ) + DEALLOCATE( CEIGS ) + DEALLOCATE( SINGVX ) + DEALLOCATE( SINGVQX ) + + END DO ! LLOOP + + WRITE(*,*) + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for CGEDMD :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + IF ( NFAIL_Z_XV == 0 ) THEN + WRITE(*,*) '>>>> Z - U*V test PASSED.' + ELSE + WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)' + WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_XW + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_z_XV + END IF + + IF ( NFAIL_AU == 0 ) THEN + WRITE(*,*) '>>>> A*U test PASSED. ' + ELSE + WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)' + WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU + END IF + + + IF ( NFAIL_REZ == 0 ) THEN + WRITE(*,*) '>>>> Rezidual computation test PASSED.' + ELSE + WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZ, 'time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ + END IF + IF ( NFAIL_TOTAL == 0 ) THEN + WRITE(*,*) '>>>> CGEDMD :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAIL_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>>>>>>>>> CGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + WRITE(*,*) + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for CGEDMDQ :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + + IF ( NFAIL_SVDIFF == 0 ) THEN + WRITE(*,*) '>>>> CGEDMD and CGEDMDQ computed singular & + &values test PASSED.' + ELSE + WRITE(*,*) 'ZGEDMD and ZGEDMDQ discrepancies in & + &the singular values unacceptable ', & + NFAIL_SVDIFF, ' times. Test FAILED.' + WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF + END IF + IF ( NFAIL_F_QR == 0 ) THEN + WRITE(*,*) '>>>> F - Q*R test PASSED.' + ELSE + WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)' + WRITE(*,*) 'The largest relative residual was ', TMP_FQR + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR + END IF + + IF ( NFAIL_REZQ == 0 ) THEN + WRITE(*,*) '>>>> Rezidual computation test PASSED.' + ELSE + WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZQ, 'time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ + END IF + + IF ( NFAILQ_TOTAL == 0 ) THEN + WRITE(*,*) '>>>>>>> CGEDMDQ :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>> CGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + WRITE(*,*) + WRITE(*,*) 'Test completed.' + STOP + END diff --git a/lapack-netlib/TESTING/EIG/cchkst.f b/lapack-netlib/TESTING/EIG/cchkst.f index 95747d051..ed535e91f 100644 --- a/lapack-netlib/TESTING/EIG/cchkst.f +++ b/lapack-netlib/TESTING/EIG/cchkst.f @@ -364,7 +364,7 @@ *> \verbatim *> D1 is REAL array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by CSTEQR simlutaneously +*> The eigenvalues of A, as computed by CSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -665,8 +665,7 @@ EXTERNAL CCOPY, CHET21, CHETRD, CHPT21, CHPTRD, CLACPY, $ CLASET, CLATMR, CLATMS, CPTEQR, CSTEDC, CSTEMR, $ CSTEIN, CSTEQR, CSTT21, CSTT22, CUNGTR, CUPGTR, - $ SCOPY, SLABAD, SLASUM, SSTEBZ, SSTECH, SSTERF, - $ XERBLA + $ SCOPY, SLASUM, SSTEBZ, SSTECH, SSTERF, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, CONJG, INT, LOG, MAX, MIN, REAL, SQRT @@ -733,7 +732,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/cchkst2stg.f b/lapack-netlib/TESTING/EIG/cchkst2stg.f index e4deb8ac8..668b51a36 100644 --- a/lapack-netlib/TESTING/EIG/cchkst2stg.f +++ b/lapack-netlib/TESTING/EIG/cchkst2stg.f @@ -385,7 +385,7 @@ *> \verbatim *> D1 is REAL array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by CSTEQR simlutaneously +*> The eigenvalues of A, as computed by CSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -683,10 +683,10 @@ EXTERNAL ILAENV, SLAMCH, SLARND, SSXT1 * .. * .. External Subroutines .. - EXTERNAL SCOPY, SLABAD, SLASUM, SSTEBZ, SSTECH, SSTERF, - $ XERBLA, CCOPY, CHET21, CHETRD, CHPT21, CHPTRD, - $ CLACPY, CLASET, CLATMR, CLATMS, CPTEQR, CSTEDC, - $ CSTEMR, CSTEIN, CSTEQR, CSTT21, CSTT22, CUNGTR, + EXTERNAL SCOPY, SLASUM, SSTEBZ, SSTECH, SSTERF, XERBLA, + $ CCOPY, CHET21, CHETRD, CHPT21, CHPTRD, CLACPY, + $ CLASET, CLATMR, CLATMS, CPTEQR, CSTEDC, CSTEMR, + $ CSTEIN, CSTEQR, CSTT21, CSTT22, CUNGTR, $ CUPGTR, CHETRD_2STAGE, SLASET * .. * .. Intrinsic Functions .. @@ -754,7 +754,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/cdrgsx.f b/lapack-netlib/TESTING/EIG/cdrgsx.f index c5baeef5e..1729770a2 100644 --- a/lapack-netlib/TESTING/EIG/cdrgsx.f +++ b/lapack-netlib/TESTING/EIG/cdrgsx.f @@ -395,7 +395,7 @@ * .. * .. External Subroutines .. EXTERNAL ALASVM, CGESVD, CGET51, CGGESX, CLACPY, CLAKF2, - $ CLASET, CLATM5, SLABAD, XERBLA + $ CLASET, CLATM5, XERBLA * .. * .. Scalars in Common .. LOGICAL FS @@ -478,7 +478,6 @@ ULPINV = ONE / ULP SMLNUM = SLAMCH( 'S' ) / ULP BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) THRSH2 = TEN*THRESH NTESTT = 0 NERRS = 0 @@ -917,7 +916,7 @@ $ / ' 2: A and B are upper triangular matrices, ', $ / ' 3: A and B are as type 2, but each second diagonal ', $ 'block in A_11 and ', / - $ ' each third diaongal block in A_22 are 2x2 blocks,', + $ ' each third diagonal block in A_22 are 2x2 blocks,', $ / ' 4: A and B are block diagonal matrices, ', $ / ' 5: (A,B) has potentially close or common ', $ 'eigenvalues.', / ) diff --git a/lapack-netlib/TESTING/EIG/cdrgvx.f b/lapack-netlib/TESTING/EIG/cdrgvx.f index 830a39d86..4114b697f 100644 --- a/lapack-netlib/TESTING/EIG/cdrgvx.f +++ b/lapack-netlib/TESTING/EIG/cdrgvx.f @@ -56,7 +56,7 @@ *> *> | l**H * (beta A - alpha B) | / ( ulp max( |beta A|, |alpha B| ) ) *> -*> where l**H is the conjugate tranpose of l. +*> where l**H is the conjugate transpose of l. *> *> (2) max over all right eigenvalue/-vector pairs (beta/alpha,r) of *> diff --git a/lapack-netlib/TESTING/EIG/cdrvsg.f b/lapack-netlib/TESTING/EIG/cdrvsg.f index d15b39d01..729976738 100644 --- a/lapack-netlib/TESTING/EIG/cdrvsg.f +++ b/lapack-netlib/TESTING/EIG/cdrvsg.f @@ -236,7 +236,7 @@ *> *> B COMPLEX array, dimension (LDB , max(NN)) *> Used to hold the Hermitian positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -420,7 +420,7 @@ * .. External Subroutines .. EXTERNAL CHBGV, CHBGVD, CHBGVX, CHEGV, CHEGVD, CHEGVX, $ CHPGV, CHPGVD, CHPGVX, CLACPY, CLASET, CLATMR, - $ CLATMS, CSGT01, SLABAD, SLAFTS, SLASUM, XERBLA + $ CLATMS, CSGT01, SLAFTS, SLASUM, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, REAL, SQRT @@ -481,7 +481,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/cdrvsg2stg.f b/lapack-netlib/TESTING/EIG/cdrvsg2stg.f index 8b8553773..f7d323247 100644 --- a/lapack-netlib/TESTING/EIG/cdrvsg2stg.f +++ b/lapack-netlib/TESTING/EIG/cdrvsg2stg.f @@ -242,7 +242,7 @@ *> *> B COMPLEX array, dimension (LDB , max(NN)) *> Used to hold the Hermitian positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -426,7 +426,7 @@ EXTERNAL LSAME, SLAMCH, SLARND * .. * .. External Subroutines .. - EXTERNAL SLABAD, SLAFTS, SLASUM, XERBLA, CHBGV, CHBGVD, + EXTERNAL SLAFTS, SLASUM, XERBLA, CHBGV, CHBGVD, $ CHBGVX, CHEGV, CHEGVD, CHEGVX, CHPGV, CHPGVD, $ CHPGVX, CLACPY, CLASET, CLATMR, CLATMS, CSGT01, $ CHEGV_2STAGE @@ -490,7 +490,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/cdrvst.f b/lapack-netlib/TESTING/EIG/cdrvst.f index 9c129c0e8..205f06f0d 100644 --- a/lapack-netlib/TESTING/EIG/cdrvst.f +++ b/lapack-netlib/TESTING/EIG/cdrvst.f @@ -204,7 +204,7 @@ *> Not modified. *> *> D1 REAL array, dimension (max(NN)) -*> The eigenvalues of A, as computed by CSTEQR simlutaneously +*> The eigenvalues of A, as computed by CSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -393,8 +393,8 @@ * .. External Subroutines .. EXTERNAL ALASVM, CHBEV, CHBEVD, CHBEVX, CHEEV, CHEEVD, $ CHEEVR, CHEEVX, CHET21, CHET22, CHPEV, CHPEVD, - $ CHPEVX, CLACPY, CLASET, CLATMR, CLATMS, SLABAD, - $ SLAFTS, XERBLA + $ CHPEVX, CLACPY, CLASET, CLATMR, CLATMS, SLAFTS, + $ XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, INT, LOG, MAX, MIN, REAL, SQRT @@ -451,7 +451,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/cdrvst2stg.f b/lapack-netlib/TESTING/EIG/cdrvst2stg.f index 954c7fb87..258f1f370 100644 --- a/lapack-netlib/TESTING/EIG/cdrvst2stg.f +++ b/lapack-netlib/TESTING/EIG/cdrvst2stg.f @@ -204,7 +204,7 @@ *> Not modified. *> *> D1 REAL array, dimension (max(NN)) -*> The eigenvalues of A, as computed by CSTEQR simlutaneously +*> The eigenvalues of A, as computed by CSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -391,7 +391,7 @@ EXTERNAL SLAMCH, SLARND, SSXT1 * .. * .. External Subroutines .. - EXTERNAL ALASVM, SLABAD, SLAFTS, XERBLA, CHBEV, CHBEVD, + EXTERNAL ALASVM, SLAFTS, XERBLA, CHBEV, CHBEVD, $ CHBEVX, CHEEV, CHEEVD, CHEEVR, CHEEVX, CHET21, $ CHET22, CHPEV, CHPEVD, CHPEVX, CLACPY, CLASET, $ CHEEVD_2STAGE, CHEEVR_2STAGE, CHEEVX_2STAGE, @@ -453,7 +453,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/cerrhs.f b/lapack-netlib/TESTING/EIG/cerrhs.f index 0568a6d78..2dd86b8c9 100644 --- a/lapack-netlib/TESTING/EIG/cerrhs.f +++ b/lapack-netlib/TESTING/EIG/cerrhs.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> CERRHS tests the error exits for CGEBAK, CGEBAL, CGEHRD, CUNGHR, -*> CUNMHR, CHSEQR, CHSEIN, and CTREVC. +*> CERRHS tests the error exits for CGEBAK, CGEBAL, CGEHRD, CGEHD2, +*> CUNGHR, CUNMHR, CHSEQR, CHSEIN, CTREVC, and CTREVC3. *> \endverbatim * * Arguments: @@ -86,7 +86,7 @@ * .. * .. External Subroutines .. EXTERNAL CHKXER, CGEBAK, CGEBAL, CGEHRD, CHSEIN, CHSEQR, - $ CUNGHR, CUNMHR, CTREVC + $ CUNGHR, CUNMHR, CTREVC, CTREVC3, CGEHD2 * .. * .. Intrinsic Functions .. INTRINSIC REAL @@ -193,6 +193,29 @@ CALL CHKXER( 'CGEHRD', INFOT, NOUT, LERR, OK ) NT = NT + 7 * +* CGEHD2 +* + SRNAMT = 'CGEHD2' + INFOT = 1 + CALL CGEHD2( -1, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGEHD2( 0, 0, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGEHD2( 0, 2, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEHD2( 1, 1, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEHD2( 0, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEHD2( 2, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK ) + NT = NT + 6 +* * CUNGHR * SRNAMT = 'CUNGHR' @@ -398,6 +421,47 @@ $ RW, INFO ) CALL CHKXER( 'CTREVC', INFOT, NOUT, LERR, OK ) NT = NT + 7 +* +* CTREVC3 +* + SRNAMT = 'CTREVC3' + INFOT = 1 + CALL CTREVC3( '/', 'A', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTREVC3( 'L', '/', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTREVC3( 'L', 'A', SEL, -1, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTREVC3( 'L', 'A', SEL, 2, A, 1, VL, 2, VR, 1, 4, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CTREVC3( 'R', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 1, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL CTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W, + $ 2, RW, 2, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 16 + CALL CTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + NT = NT + 9 END IF * * Print a summary line. diff --git a/lapack-netlib/TESTING/EIG/cerrst.f b/lapack-netlib/TESTING/EIG/cerrst.f index eef34b44b..d23eb14ea 100644 --- a/lapack-netlib/TESTING/EIG/cerrst.f +++ b/lapack-netlib/TESTING/EIG/cerrst.f @@ -21,7 +21,7 @@ *> *> \verbatim *> -*> CERRST tests the error exits for CHETRD, CUNGTR, CUNMTR, CHPTRD, +*> CERRST tests the error exits for CHETRD, CHETD2, CUNGTR, CUNMTR, CHPTRD, *> CUNGTR, CUPMTR, CSTEQR, CSTEIN, CPTEQR, CHBTRD, *> CHEEV, CHEEVX, CHEEVD, CHBEV, CHBEVX, CHBEVD, *> CHPEV, CHPEVX, CHPEVD, and CSTEDC. @@ -94,7 +94,7 @@ EXTERNAL CHBEV, CHBEVD, CHBEVX, CHBTRD, CHEEV, CHEEVD, $ CHEEVR, CHEEVX, CHETRD, CHKXER, CHPEV, CHPEVD, $ CHPEVX, CHPTRD, CPTEQR, CSTEDC, CSTEIN, CSTEQR, - $ CUNGTR, CUNMTR, CUPGTR, CUPMTR, + $ CUNGTR, CUNMTR, CUPGTR, CUPMTR, CHETD2, $ CHEEVD_2STAGE, CHEEVR_2STAGE, CHEEVX_2STAGE, $ CHEEV_2STAGE, CHBEV_2STAGE, CHBEVD_2STAGE, $ CHBEVX_2STAGE, CHETRD_2STAGE, CHETRD_HE2HB, @@ -156,6 +156,20 @@ CALL CHKXER( 'CHETRD', INFOT, NOUT, LERR, OK ) NT = NT + 4 * +* CHETD2 +* + SRNAMT = 'CHETD2' + INFOT = 1 + CALL CHETD2( '/', 0, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHETD2( 'U', -1, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHETD2( 'U', 2, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK ) + NT = NT + 3 +* * CHETRD_2STAGE * SRNAMT = 'CHETRD_2STAGE' @@ -628,56 +642,56 @@ SRNAMT = 'CHEEVX_2STAGE' INFOT = 1 CALL CHEEVX_2STAGE( '/', 'A', 'U', 0, A, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 1, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 1 CALL CHEEVX_2STAGE( 'V', 'A', 'U', 0, A, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 1, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CHEEVX_2STAGE( 'N', '/', 'U', 0, A, 1, - $ 0.0D0, 1.0D0, 1, 0, 0.0D0, + $ 0.0, 1.0, 1, 0, 0.0, $ M, X, Z, 1, W, 1, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHEEVX_2STAGE( 'N', 'A', '/', 0, A, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 1, RW, IW, I3, INFO ) INFOT = 4 CALL CHEEVX_2STAGE( 'N', 'A', 'U', -1, A, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 1, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CHEEVX_2STAGE( 'N', 'A', 'U', 2, A, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 2, W, 3, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CHEEVX_2STAGE( 'N', 'V', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 1, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHEEVX_2STAGE( 'N', 'I', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 1, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CHEEVX_2STAGE( 'N', 'I', 'U', 2, A, 2, - $ 0.0D0, 0.0D0, 2, 1, 0.0D0, + $ 0.0, 0.0, 2, 1, 0.0, $ M, X, Z, 2, W, 3, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 15 CALL CHEEVX_2STAGE( 'N', 'A', 'U', 2, A, 2, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 0, W, 3, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 17 CALL CHEEVX_2STAGE( 'N', 'A', 'U', 2, A, 2, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 2, W, 0, RW, IW, I1, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) NT = NT + 11 @@ -734,17 +748,17 @@ CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 2*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) NT = NT + 12 @@ -755,80 +769,80 @@ N = 1 INFOT = 1 CALL CHEEVR_2STAGE( '/', 'A', 'U', 0, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 1 CALL CHEEVR_2STAGE( 'V', 'A', 'U', 0, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CHEEVR_2STAGE( 'N', '/', 'U', 0, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHEEVR_2STAGE( 'N', 'A', '/', -1, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, $ IW( 2*N+1 ), 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHEEVR_2STAGE( 'N', 'A', 'U', -1, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, $ IW( 2*N+1 ), 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CHEEVR_2STAGE( 'N', 'A', 'U', 2, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CHEEVR_2STAGE( 'N', 'V', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 0, 1, 0.0D0, + $ 0.0, 0.0, 0, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 2, A, 2, - $ 0.0D0, 0.0D0, 2, 1, 0.0D0, + $ 0.0, 0.0, 2, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 15 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 0, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ 0.0, 0.0, 1, 1, 0.0, + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N-1, IW( 2*N-1 ), + $ 0.0, 0.0, 1, 1, 0.0, + $ M, R, Z, 1, IW, Q, 26*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 10*N-1, + $ 0.0, 0.0, 1, 1, 0.0, + $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) NT = NT + 13 @@ -1259,65 +1273,65 @@ SRNAMT = 'CHBEVX_2STAGE' INFOT = 1 CALL CHBEVX_2STAGE( '/', 'A', 'U', 0, 0, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) INFOT = 1 CALL CHBEVX_2STAGE( 'V', 'A', 'U', 0, 0, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CHBEVX_2STAGE( 'N', '/', 'U', 0, 0, A, 1, Q, 1, - $ 0.0D0, 1.0D0, 1, 0, 0.0D0, + $ 0.0, 1.0, 1, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHBEVX_2STAGE( 'N', 'A', '/', 0, 0, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) INFOT = 4 CALL CHBEVX_2STAGE( 'N', 'A', 'U', -1, 0, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CHBEVX_2STAGE( 'N', 'A', 'U', 0, -1, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHBEVX_2STAGE( 'N', 'A', 'U', 2, 1, A, 1, Q, 2, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 2, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) * INFOT = 9 * CALL CHBEVX_2STAGE( 'V', 'A', 'U', 2, 0, A, 1, Q, 1, -* $ 0.0D0, 0.0D0, 0, 0, 0.0D0, +* $ 0.0, 0.0, 0, 0, 0.0, * $ M, X, Z, 2, W, 0, RW, IW, I3, INFO ) * CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CHBEVX_2STAGE( 'N', 'V', 'U', 1, 0, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHBEVX_2STAGE( 'N', 'I', 'U', 1, 0, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CHBEVX_2STAGE( 'N', 'I', 'U', 1, 0, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 1, 2, 0.0D0, + $ 0.0, 0.0, 1, 2, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL CHBEVX_2STAGE( 'N', 'A', 'U', 2, 0, A, 1, Q, 2, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 0, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL CHBEVX_2STAGE( 'N', 'A', 'U', 2, 0, A, 1, Q, 2, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) NT = NT + 12 diff --git a/lapack-netlib/TESTING/EIG/chkxer.f b/lapack-netlib/TESTING/EIG/chkxer.f index fd00bb65a..70caf7e0a 100644 --- a/lapack-netlib/TESTING/EIG/chkxer.f +++ b/lapack-netlib/TESTING/EIG/chkxer.f @@ -61,7 +61,7 @@ RETURN * 9999 FORMAT( ' *** Illegal value of parameter number ', I2, - $ ' not detected by ', A6, ' ***' ) + $ ' not detected by ', A, ' ***' ) * * End of CHKXER * diff --git a/lapack-netlib/TESTING/EIG/csyl01.f b/lapack-netlib/TESTING/EIG/csyl01.f index 82d790daa..8a3cd1ae5 100644 --- a/lapack-netlib/TESTING/EIG/csyl01.f +++ b/lapack-netlib/TESTING/EIG/csyl01.f @@ -120,14 +120,16 @@ COMPLEX RMUL * .. * .. Local Arrays .. - COMPLEX A( MAXM, MAXM ), B( MAXN, MAXN ), - $ C( MAXM, MAXN ), CC( MAXM, MAXN ), - $ X( MAXM, MAXN ), - $ DUML( MAXM ), DUMR( MAXN ), + COMPLEX DUML( MAXM ), DUMR( MAXN ), $ D( MAX( MAXM, MAXN ) ) - REAL SWORK( LDSWORK, 54 ), DUM( MAXN ), VM( 2 ) + REAL DUM( MAXN ), VM( 2 ) INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) * .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, C, CC, X + REAL, DIMENSION(:,:), ALLOCATABLE :: SWORK +* .. * .. External Functions .. LOGICAL SISNAN REAL SLAMCH, CLANGE @@ -139,6 +141,20 @@ * .. Intrinsic Functions .. INTRINSIC ABS, REAL, MAX * .. +* .. Allocate memory dynamically .. + ALLOCATE ( A( MAXM, MAXM ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( B( MAXN, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( C( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( CC( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( X( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( SWORK( LDSWORK, 54 ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. * * Get machine parameters @@ -286,6 +302,13 @@ END DO END DO END DO +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (CC, STAT = AllocateStatus) + DEALLOCATE (X, STAT = AllocateStatus) + DEALLOCATE (SWORK, STAT = AllocateStatus) * RETURN * diff --git a/lapack-netlib/TESTING/EIG/dchkdmd.f90 b/lapack-netlib/TESTING/EIG/dchkdmd.f90 new file mode 100644 index 000000000..4fbf7531b --- /dev/null +++ b/lapack-netlib/TESTING/EIG/dchkdmd.f90 @@ -0,0 +1,813 @@ +! This is a test program for checking the implementations of +! the implementations of the following subroutines +! +! DGEDMD for computation of the +! Dynamic Mode Decomposition (DMD) +! DGEDMDQ for computation of a +! QR factorization based compressed DMD +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! ======================================================== +! How to run the code (compiler, link info) +! ======================================================== +! Compile as FORTRAN 90 (or later) and link with BLAS and +! LAPACK libraries. +! NOTE: The code is developed and tested on top of the +! Intel MKL library (versions 2022.0.3 and 2022.2.0), +! using the Intel Fortran compiler. +! +! For developers of the C++ implementation +! ======================================================== +! See the LAPACK++ and Template Numerical Toolkit (TNT) +! +! Note on a development of the GPU HP implementation +! ======================================================== +! Work in progress. See CUDA, MAGMA, SLATE. +! NOTE: The four SVD subroutines used in this code are +! included as a part of R&D and for the completeness. +! This was also an opportunity to test those SVD codes. +! If the scaling option is used all four are essentially +! equally good. For implementations on HP platforms, +! one can use whichever SVD is available. +!... ......................................................... +! NOTE: +! When using the Intel MKL 2022.0.3 the subroutine xGESVDQ +! (optionally used in xGEDMD) may cause access violation +! error for x = S, D, C, Z, but only if called with the +! work space query. (At least in our Windows 10 MSVS 2019.) +! The problem can be mitigated by downloading the source +! code of xGESVDQ from the LAPACK repository and use it +! localy instead of the one in the MKL. This seems to +! indicate that the problem is indeed in the MKL. +! This problem did not appear whith Intel MKL 2022.2.0. +! +! NOTE: +! xGESDD seems to have a problem with workspace. In some +! cases the length of the optimal workspace is returned +! smaller than the minimal workspace, as specified in the +! code. As a precaution, all optimal workspaces are +! set as MAX(minimal, optimal). +! Latest implementations of complex xGESDD have different +! length of the real worksapce. We use max value over +! two versions. +!............................................................ +!............................................................ +! + PROGRAM DMD_TEST + use iso_fortran_env, only: real64 + IMPLICIT NONE + integer, parameter :: WP = real64 + +!............................................................ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP +!............................................................ + REAL(KIND=WP), ALLOCATABLE, DIMENSION(:,:) :: & + A, AC, EIGA, LAMBDA, LAMBDAQ, F, F1, F2,& + Z, Z1, S, AU, W, VA, X, X0, Y, Y0, Y1 + REAL(KIND=WP), ALLOCATABLE, DIMENSION(:) :: & + DA, DL, DR, REIG, REIGA, REIGQ, IEIG, & + IEIGA, IEIGQ, RES, RES1, RESEX, SINGVX,& + SINGVQX, WORK + INTEGER , ALLOCATABLE, DIMENSION(:) :: IWORK + REAL(KIND=WP) :: AB(2,2), WDUMMY(2) + INTEGER :: IDUMMY(2), ISEED(4), RJOBDATA(8) + REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, DMAX, EPS, & + TOL, TOL2, SVDIFF, TMP, TMP_AU, & + TMP_FQR, TMP_REZ, TMP_REZQ, TMP_ZXW, & + TMP_EX, XNORM, YNORM +!............................................................ + INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY, & + LDZ, LIWORK, LWORK, M, N, L, LLOOP, NRNK + INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, j, KDIFF, & + NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ, & + NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL, & + NFAIL_Z_XV, MODE, MODEL, MODER, WHTSVD + INTEGER iNRNK, iWHTSVD, K_TRAJ, LWMINOPT + CHARACTER(LEN=1) GRADE, JOBREF, JOBZ, PIVTNG, RSIGN, & + SCALE, RESIDS, WANTQ, WANTR + + LOGICAL TEST_QRDMD +!..... external subroutines (BLAS and LAPACK) + EXTERNAL DAXPY, DGEEV, DGEMM, DGEMV, DLACPY, DLASCL + EXTERNAL DLARNV, DLATMR +!.....external subroutines DMD package, part 1 +! subroutines under test + EXTERNAL DGEDMD, DGEDMDQ + +!..... external functions (BLAS and LAPACK) + EXTERNAL DLAMCH, DLANGE, DNRM2 + REAL(KIND=WP) :: DLAMCH, DLANGE, DNRM2 + EXTERNAL LSAME + LOGICAL LSAME + + INTRINSIC ABS, INT, MIN, MAX +!............................................................ + + ! The test is always in pairs : ( DGEDMD and DGEDMDQ ) + ! because the test includes comparing the results (in pairs). +!..................................................................................... + TEST_QRDMD = .TRUE. ! This code by default performs tests on DGEDMDQ + ! Since the QR factorizations based algorithm is designed for + ! single trajectory data, only single trajectory tests will + ! be performed with xGEDMDQ. + WANTQ = 'Q' + WANTR = 'R' +!................................................................................. + + EPS = DLAMCH( 'P' ) ! machine precision DP + + ! Global counters of failures of some particular tests + NFAIL = 0 + NFAIL_REZ = 0 + NFAIL_REZQ = 0 + NFAIL_Z_XV = 0 + NFAIL_F_QR = 0 + NFAIL_AU = 0 + KDIFF = 0 + NFAIL_SVDIFF = 0 + NFAIL_TOTAL = 0 + NFAILQ_TOTAL = 0 + + + DO LLOOP = 1, 4 + + WRITE(*,*) 'L Loop Index = ', LLOOP + + ! Set the dimensions of the problem ... + WRITE(*,*) 'M = ' + READ(*,*) M + WRITE(*,*) M + ! ... and the number of snapshots. + WRITE(*,*) 'N = ' + READ(*,*) N + WRITE(*,*) N + + ! ... Test the dimensions + IF ( ( MIN(M,N) == 0 ) .OR. ( M < N ) ) THEN + WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.' + STOP + END IF +!............. + ! The seed inside the LLOOP so that each pass can be reproduced easily. + + ISEED(1) = 4 + ISEED(2) = 3 + ISEED(3) = 2 + ISEED(4) = 1 + + LDA = M + LDF = M + LDX = MAX(M,N+1) + LDY = MAX(M,N+1) + LDW = N + LDZ = M + LDAU = MAX(M,N+1) + LDS = N + + TMP_ZXW = ZERO + TMP_AU = ZERO + TMP_REZ = ZERO + TMP_REZQ = ZERO + SVDIFF = ZERO + TMP_EX = ZERO + + ! + ! Test the subroutines on real data snapshots. All + ! computation is done in real arithmetic, even when + ! Koopman eigenvalues and modes are real. + ! + ! Allocate memory space + ALLOCATE( A(LDA,M) ) + ALLOCATE( AC(LDA,M) ) + ALLOCATE( DA(M) ) + ALLOCATE( DL(M) ) + ALLOCATE( F(LDF,N+1) ) + ALLOCATE( F1(LDF,N+1) ) + ALLOCATE( F2(LDF,N+1) ) + ALLOCATE( X(LDX,N) ) + ALLOCATE( X0(LDX,N) ) + ALLOCATE( SINGVX(N) ) + ALLOCATE( SINGVQX(N) ) + ALLOCATE( Y(LDY,N+1) ) + ALLOCATE( Y0(LDY,N+1) ) + ALLOCATE( Y1(M,N+1) ) + ALLOCATE( Z(LDZ,N) ) + ALLOCATE( Z1(LDZ,N) ) + ALLOCATE( RES(N) ) + ALLOCATE( RES1(N) ) + ALLOCATE( RESEX(N) ) + ALLOCATE( REIG(N) ) + ALLOCATE( IEIG(N) ) + ALLOCATE( REIGQ(N) ) + ALLOCATE( IEIGQ(N) ) + ALLOCATE( REIGA(M) ) + ALLOCATE( IEIGA(M) ) + ALLOCATE( VA(LDA,M) ) + ALLOCATE( LAMBDA(N,2) ) + ALLOCATE( LAMBDAQ(N,2) ) + ALLOCATE( EIGA(M,2) ) + ALLOCATE( W(LDW,N) ) + ALLOCATE( AU(LDAU,N) ) + ALLOCATE( S(N,N) ) + + TOL = M*EPS + ! This mimics O(M*N)*EPS bound for accumulated roundoff error. + ! The factor 10 is somewhat arbitrary. + TOL2 = 10*M*N*EPS + +!............. + + DO K_TRAJ = 1, 2 + ! Number of intial conditions in the simulation/trajectories (1 or 2) + + COND = 1.0D8 + DMAX = 1.0D2 + RSIGN = 'F' + GRADE = 'N' + MODEL = 6 + CONDL = 1.0D2 + MODER = 6 + CONDR = 1.0D2 + PIVTNG = 'N' + + ! Loop over all parameter MODE values for ZLATMR (+1,..,+6) + DO MODE = 1, 6 + + ALLOCATE( IWORK(2*M) ) + ALLOCATE(DR(N)) + CALL DLATMR( M, M, 'S', ISEED, 'N', DA, MODE, COND, & + DMAX, RSIGN, GRADE, DL, MODEL, CONDL, & + DR, MODER, CONDR, PIVTNG, IWORK, M, M, & + ZERO, -ONE, 'N', A, LDA, IWORK(M+1), INFO ) + DEALLOCATE(IWORK) + DEALLOCATE(DR) + + LWORK = 4*M+1 + ALLOCATE(WORK(LWORK)) + AC = A + CALL DGEEV( 'N','V', M, AC, M, REIGA, IEIGA, VA, M, & + VA, M, WORK, LWORK, INFO ) ! LAPACK CALL + DEALLOCATE(WORK) + TMP = ZERO + DO i = 1, M + EIGA(i,1) = REIGA(i) + EIGA(i,2) = IEIGA(i) + TMP = MAX( TMP, SQRT(REIGA(i)**2+IEIGA(i)**2)) + END DO + + ! Scale A to have the desirable spectral radius. + CALL DLASCL( 'G', 0, 0, TMP, ONE, M, M, A, M, INFO ) + CALL DLASCL( 'G', 0, 0, TMP, ONE, M, 2, EIGA, M, INFO ) + + ! Compute the norm of A + ANORM = DLANGE( 'F', N, N, A, M, WDUMMY ) + + IF ( K_TRAJ == 2 ) THEN + ! generate data with two inital conditions + CALL DLARNV(2, ISEED, M, F1(1,1) ) + F1(1:M,1) = 1.0E-10*F1(1:M,1) + DO i = 1, N/2 + CALL DGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, & + F1(1,i+1), 1 ) + END DO + X0(1:M,1:N/2) = F1(1:M,1:N/2) + Y0(1:M,1:N/2) = F1(1:M,2:N/2+1) + + CALL DLARNV(2, ISEED, M, F1(1,1) ) + DO i = 1, N-N/2 + CALL DGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, & + F1(1,i+1), 1 ) + END DO + X0(1:M,N/2+1:N) = F1(1:M,1:N-N/2) + Y0(1:M,N/2+1:N) = F1(1:M,2:N-N/2+1) + ELSE + CALL DLARNV(2, ISEED, M, F(1,1) ) + DO i = 1, N + CALL DGEMV( 'N', M, M, ONE, A, M, F(1,i), 1, ZERO, & + F(1,i+1), 1 ) + END DO + X0(1:M,1:N) = F(1:M,1:N) + Y0(1:M,1:N) = F(1:M,2:N+1) + END IF + + XNORM = DLANGE( 'F', M, N, X0, LDX, WDUMMY ) + YNORM = DLANGE( 'F', M, N, Y0, LDX, WDUMMY ) +!............................................................ + + DO iJOBZ = 1, 4 + + SELECT CASE ( iJOBZ ) + CASE(1) + JOBZ = 'V' ! Ritz vectors will be computed + RESIDS = 'R' ! Residuals will be computed + CASE(2) + JOBZ = 'V' + RESIDS = 'N' + CASE(3) + JOBZ = 'F' ! Ritz vectors in factored form + RESIDS = 'N' + CASE(4) + JOBZ = 'N' + RESIDS = 'N' + END SELECT + + DO iJOBREF = 1, 3 + + SELECT CASE ( iJOBREF ) + CASE(1) + JOBREF = 'R' ! Data for refined Ritz vectors + CASE(2) + JOBREF = 'E' ! Exact DMD vectors + CASE(3) + JOBREF = 'N' + END SELECT + + DO iSCALE = 1, 4 + + SELECT CASE ( iSCALE ) + CASE(1) + SCALE = 'S' ! X data normalized + CASE(2) + SCALE = 'C' ! X normalized, consist. check + CASE(3) + SCALE = 'Y' ! Y data normalized + CASE(4) + SCALE = 'N' + END SELECT + + DO iNRNK = -1, -2, -1 + ! Two truncation strategies. The "-2" case for R&D + ! purposes only - it uses possibly low accuracy small + ! singular values, in which case the formulas used in + ! the DMD are highly sensitive. + NRNK = iNRNK + + DO iWHTSVD = 1, 4 + ! Check all four options to compute the POD basis + ! via the SVD. + WHTSVD = iWHTSVD + + DO LWMINOPT = 1, 2 + ! Workspace query for the minimal (1) and for the optimal + ! (2) workspace lengths determined by workspace query. + + X(1:M,1:N) = X0(1:M,1:N) + Y(1:M,1:N) = Y0(1:M,1:N) + + ! DGEDMD: Workspace query and workspace allocation + CALL DGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, & + N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, & + LDZ, RES, AU, LDAU, W, LDW, S, LDS, WDUMMY, -1, & + IDUMMY, -1, INFO ) + + LIWORK = IDUMMY(1) + ALLOCATE( IWORK(LIWORK) ) + LWORK = INT(WDUMMY(LWMINOPT)) + ALLOCATE( WORK(LWORK) ) + + ! DGEDMD test: CALL DGEDMD + CALL DGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, & + N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, & + LDZ, RES, AU, LDAU, W, LDW, S, LDS, WORK, LWORK,& + IWORK, LIWORK, INFO ) + + SINGVX(1:N) = WORK(1:N) + + !...... DGEDMD check point + IF ( LSAME(JOBZ,'V') ) THEN + ! Check that Z = X*W, on return from DGEDMD + ! This checks that the returned aigenvectors in Z are + ! the product of the SVD'POD basis returned in X + ! and the eigenvectors of the rayleigh quotient + ! returned in W + CALL DGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, & + ZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL DAXPY( M, -ONE, Z(1,i), 1, Z1(1,i), 1) + TMP = MAX(TMP, DNRM2( M, Z1(1,i), 1 ) ) + END DO + TMP_ZXW = MAX(TMP_ZXW, TMP ) + + IF ( TMP_ZXW > 10*M*EPS ) THEN + NFAIL_Z_XV = NFAIL_Z_XV + 1 + WRITE(*,*) ':( .................DGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + + END IF + + !...... DGEDMD check point + IF ( LSAME(JOBREF,'R') ) THEN + ! The matrix A*U is returned for computing refined Ritz vectors. + ! Check that A*U is computed correctly using the formula + ! A*U = Y * V * inv(SIGMA). This depends on the + ! accuracy in the computed singular values and vectors of X. + ! See the paper for an error analysis. + ! Note that the left singular vectors of the input matrix X + ! are returned in the array X. + CALL DGEMM( 'N', 'N', M, K, M, ONE, A, LDA, X, LDX, & + ZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL DAXPY( M, -ONE, AU(1,i), 1, Z1(1,i), 1) + TMP = MAX( TMP, DNRM2( M, Z1(1,i),1 ) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_AU = MAX( TMP_AU, TMP ) + + IF ( TMP > TOL2 ) THEN + NFAIL_AU = NFAIL_AU + 1 + WRITE(*,*) ':( .................DGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + + ELSEIF ( LSAME(JOBREF,'E') ) THEN + ! The unscaled vectors of the Exact DMD are computed. + ! This option is included for the sake of completeness, + ! for users who prefer the Exact DMD vectors. The + ! returned vectors are in the real form, in the same way + ! as the Ritz vectors. Here we just save the vectors + ! and test them separately using a Matlab script. + + CALL DGEMM( 'N', 'N', M, K, M, ONE, A, LDA, AU, LDAU, ZERO, Y1, M ) + i=1 + DO WHILE ( i <= K ) + IF ( IEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL DAXPY( M, -REIG(i), AU(1,i), 1, Y1(1,i), 1 ) + RESEX(i) = DNRM2( M, Y1(1,i), 1) / DNRM2(M,AU(1,i),1) + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IEIG(i) + AB(1,2) = IEIG(i) + AB(2,2) = REIG(i) + CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, AU(1,i), & + M, AB, 2, ONE, Y1(1,i), M ) + RESEX(i) = DLANGE( 'F', M, 2, Y1(1,i), M, & + WORK )/ DLANGE( 'F', M, 2, AU(1,i), M, & + WORK ) + RESEX(i+1) = RESEX(i) + i = i + 2 + END IF + END DO + + END IF + + !...... DGEDMD check point + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by DGEDMD with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL DGEMM( 'N', 'N', M, K, M, ONE, A, LDA, Z, LDZ, ZERO, Y1, M ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in DGEDMD,) + i = 1 + DO WHILE ( i <= K ) + IF ( IEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL DAXPY( M, -REIG(i), Z(1,i), 1, Y1(1,i), 1 ) + RES1(i) = DNRM2( M, Y1(1,i), 1) + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IEIG(i) + AB(1,2) = IEIG(i) + AB(2,2) = REIG(i) + CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), & + M, AB, 2, ONE, Y1(1,i), M ) + RES1(i) = DLANGE( 'F', M, 2, Y1(1,i), M, & + WORK ) + RES1(i+1) = RES1(i) + i = i + 2 + END IF + END DO + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_REZ = MAX( TMP_REZ, TMP ) + + IF ( TMP > TOL2 ) THEN + NFAIL_REZ = NFAIL_REZ + 1 + WRITE(*,*) ':( ..................DGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + + IF ( LSAME(JOBREF,'E') ) THEN + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) ) + END DO + TMP_EX = MAX(TMP_EX,TMP) + END IF + + END IF + + !..... store the results for inspection + DO i = 1, K + LAMBDA(i,1) = REIG(i) + LAMBDA(i,2) = IEIG(i) + END DO + + DEALLOCATE(IWORK) + DEALLOCATE(WORK) + + !====================================================================== + ! Now test the DGEDMDQ + !====================================================================== + IF ( TEST_QRDMD .AND. (K_TRAJ == 1) ) THEN + RJOBDATA(2) = 1 + F1 = F + + ! DGEDMDQ test: Workspace query and workspace allocation + CALL DGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, & + JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, & + LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ, & + RES, AU, LDAU, W, LDW, S, LDS, WDUMMY, & + -1, IDUMMY, -1, INFO ) + LIWORK = IDUMMY(1) + ALLOCATE( IWORK(LIWORK) ) + LWORK = INT(WDUMMY(LWMINOPT)) + ALLOCATE(WORK(LWORK)) + ! DGEDMDQ test: CALL DGEDMDQ + CALL DGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, & + JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, & + LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ, & + RES, AU, LDAU, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) + + SINGVQX(1:KQ) = WORK(MIN(M,N+1)+1: MIN(M,N+1)+KQ) + + !..... DGEDMDQ check point + IF ( KQ /= K ) THEN + KDIFF = KDIFF+1 + END IF + + TMP = ZERO + DO i = 1, MIN(K, KQ) + TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / & + SINGVX(1) ) + END DO + SVDIFF = MAX( SVDIFF, TMP ) + IF ( TMP > M*N*EPS ) THEN + WRITE(*,*) 'FAILED! Something was wrong with the run.' + NFAIL_SVDIFF = NFAIL_SVDIFF + 1 + DO j =1, 3 + write(*,*) j, SINGVX(j), SINGVQX(j) + read(*,*) + END DO + END IF + + !..... DGEDMDQ check point + IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN + ! Check that the QR factors are computed and returned + ! as requested. The residual ||F-Q*R||_F / ||F||_F + ! is compared to M*N*EPS. + F2 = F + CALL DGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -ONE, F1, & + LDF, Y, LDY, ONE, F2, LDF ) + TMP_FQR = DLANGE( 'F', M, N+1, F2, LDF, WORK ) / & + DLANGE( 'F', M, N+1, F, LDF, WORK ) + IF ( TMP_FQR > TOL2 ) THEN + WRITE(*,*) 'FAILED! Something was wrong with the run.' + NFAIL_F_QR = NFAIL_F_QR + 1 + END IF + END IF + + !..... DGEDMDQ check point + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by DGEDMDQ with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL DGEMM( 'N', 'N', M, KQ, M, ONE, A, M, Z, M, ZERO, Y1, M ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in DGEDMDQ) + i = 1 + DO WHILE ( i <= KQ ) + IF ( IEIGQ(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL DAXPY( M, -REIGQ(i), Z(1,i), 1, Y1(1,i), 1 ) + ! Y(1:M,i) = Y(1:M,i) - REIG(i)*Z(1:M,i) + RES1(i) = DNRM2( M, Y1(1,i), 1) + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIGQ(i) + AB(2,1) = -IEIGQ(i) + AB(1,2) = IEIGQ(i) + AB(2,2) = REIGQ(i) + CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), & + M, AB, 2, ONE, Y1(1,i), M ) ! BLAS CALL + ! Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB ! INTRINSIC + RES1(i) = DLANGE( 'F', M, 2, Y1(1,i), M, & + WORK ) ! LAPACK CALL + RES1(i+1) = RES1(i) + i = i + 2 + END IF + END DO + TMP = ZERO + DO i = 1, KQ + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVQX(K)/(ANORM*SINGVQX(1)) ) + END DO + TMP_REZQ = MAX( TMP_REZQ, TMP ) + IF ( TMP > TOL2 ) THEN + NFAIL_REZQ = NFAIL_REZQ + 1 + WRITE(*,*) '................ DGEDMDQ FAILED!', & + 'Check the code for implementation errors.' + STOP + END IF + + END IF + + DO i = 1, KQ + LAMBDAQ(i,1) = REIGQ(i) + LAMBDAQ(i,2) = IEIGQ(i) + END DO + + DEALLOCATE(WORK) + DEALLOCATE(IWORK) + END IF ! TEST_QRDMD +!====================================================================== + + END DO ! LWMINOPT + !write(*,*) 'LWMINOPT loop completed' + END DO ! WHTSVD LOOP + !write(*,*) 'WHTSVD loop completed' + END DO ! NRNK LOOP + !write(*,*) 'NRNK loop completed' + END DO ! SCALE LOOP + !write(*,*) 'SCALE loop completed' + END DO ! JOBF LOOP + !write(*,*) 'JOBREF loop completed' + END DO ! JOBZ LOOP + !write(*,*) 'JOBZ loop completed' + + END DO ! MODE -6:6 + !write(*,*) 'MODE loop completed' + END DO ! 1 or 2 trajectories + !write(*,*) 'trajectories loop completed' + + DEALLOCATE(A) + DEALLOCATE(AC) + DEALLOCATE(DA) + DEALLOCATE(DL) + DEALLOCATE(F) + DEALLOCATE(F1) + DEALLOCATE(F2) + DEALLOCATE(X) + DEALLOCATE(X0) + DEALLOCATE(SINGVX) + DEALLOCATE(SINGVQX) + DEALLOCATE(Y) + DEALLOCATE(Y0) + DEALLOCATE(Y1) + DEALLOCATE(Z) + DEALLOCATE(Z1) + DEALLOCATE(RES) + DEALLOCATE(RES1) + DEALLOCATE(RESEX) + DEALLOCATE(REIG) + DEALLOCATE(IEIG) + DEALLOCATE(REIGQ) + DEALLOCATE(IEIGQ) + DEALLOCATE(REIGA) + DEALLOCATE(IEIGA) + DEALLOCATE(VA) + DEALLOCATE(LAMBDA) + DEALLOCATE(LAMBDAQ) + DEALLOCATE(EIGA) + DEALLOCATE(W) + DEALLOCATE(AU) + DEALLOCATE(S) + +!............................................................ + ! Generate random M-by-M matrix A. Use DLATMR from + END DO ! LLOOP + + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for DGEDMD :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + IF ( NFAIL_Z_XV == 0 ) THEN + WRITE(*,*) '>>>> Z - U*V test PASSED.' + ELSE + WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)' + WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_ZXW + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_Z_XV + END IF + IF ( NFAIL_AU == 0 ) THEN + WRITE(*,*) '>>>> A*U test PASSED. ' + ELSE + WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)' + WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU + END IF + + IF ( NFAIL_REZ == 0 ) THEN + WRITE(*,*) '>>>> Rezidual computation test PASSED.' + ELSE + WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZ, 'time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ + END IF + + IF ( NFAIL_TOTAL == 0 ) THEN + WRITE(*,*) '>>>> DGEDMD :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAIL_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>>>>>>>>> DGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + IF ( TEST_QRDMD ) THEN + WRITE(*,*) + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for DGEDMDQ :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + + IF ( NFAIL_SVDIFF == 0 ) THEN + WRITE(*,*) '>>>> DGEDMD and DGEDMDQ computed singular & + &values test PASSED.' + ELSE + WRITE(*,*) 'DGEDMD and DGEDMDQ discrepancies in & + &the singular values unacceptable ', & + NFAIL_SVDIFF, ' times. Test FAILED.' + WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF + END IF + + IF ( NFAIL_F_QR == 0 ) THEN + WRITE(*,*) '>>>> F - Q*R test PASSED.' + ELSE + WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)' + WRITE(*,*) 'The largest relative residual was ', TMP_FQR + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR + END IF + + IF ( NFAIL_REZQ == 0 ) THEN + WRITE(*,*) '>>>> Rezidual computation test PASSED.' + ELSE + WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZQ, 'time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ + END IF + + IF ( NFAILQ_TOTAL == 0 ) THEN + WRITE(*,*) '>>>>>>> DGEDMDQ :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>> DGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + END IF + + WRITE(*,*) + WRITE(*,*) 'Test completed.' + STOP + END diff --git a/lapack-netlib/TESTING/EIG/dchkst.f b/lapack-netlib/TESTING/EIG/dchkst.f index 2e04f68c5..6e02c84fb 100644 --- a/lapack-netlib/TESTING/EIG/dchkst.f +++ b/lapack-netlib/TESTING/EIG/dchkst.f @@ -363,7 +363,7 @@ *> \verbatim *> D1 is DOUBLE PRECISION array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by DSTEQR simlutaneously +*> The eigenvalues of A, as computed by DSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -645,10 +645,10 @@ EXTERNAL ILAENV, DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL DCOPY, DLABAD, DLACPY, DLASET, DLASUM, DLATMR, - $ DLATMS, DOPGTR, DORGTR, DPTEQR, DSPT21, DSPTRD, - $ DSTEBZ, DSTECH, DSTEDC, DSTEMR, DSTEIN, DSTEQR, - $ DSTERF, DSTT21, DSTT22, DSYT21, DSYTRD, XERBLA + EXTERNAL DCOPY, DLACPY, DLASET, DLASUM, DLATMR, DLATMS, + $ DOPGTR, DORGTR, DPTEQR, DSPT21, DSPTRD, DSTEBZ, + $ DSTECH, DSTEDC, DSTEMR, DSTEIN, DSTEQR, DSTERF, + $ DSTT21, DSTT22, DSYT21, DSYTRD, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, INT, LOG, MAX, MIN, SQRT @@ -715,7 +715,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/dchkst2stg.f b/lapack-netlib/TESTING/EIG/dchkst2stg.f index 2c98b802d..0fbebc8b7 100644 --- a/lapack-netlib/TESTING/EIG/dchkst2stg.f +++ b/lapack-netlib/TESTING/EIG/dchkst2stg.f @@ -384,7 +384,7 @@ *> \verbatim *> D1 is DOUBLE PRECISION array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by DSTEQR simlutaneously +*> The eigenvalues of A, as computed by DSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -666,10 +666,10 @@ EXTERNAL ILAENV, DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL DCOPY, DLABAD, DLACPY, DLASET, DLASUM, DLATMR, - $ DLATMS, DOPGTR, DORGTR, DPTEQR, DSPT21, DSPTRD, - $ DSTEBZ, DSTECH, DSTEDC, DSTEMR, DSTEIN, DSTEQR, - $ DSTERF, DSTT21, DSTT22, DSYT21, DSYTRD, XERBLA, + EXTERNAL DCOPY, DLACPY, DLASET, DLASUM, DLATMR, DLATMS, + $ DOPGTR, DORGTR, DPTEQR, DSPT21, DSPTRD, DSTEBZ, + $ DSTECH, DSTEDC, DSTEMR, DSTEIN, DSTEQR, DSTERF, + $ DSTT21, DSTT22, DSYT21, DSYTRD, XERBLA, $ DSYTRD_2STAGE * .. * .. Intrinsic Functions .. @@ -737,7 +737,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/ddrgsx.f b/lapack-netlib/TESTING/EIG/ddrgsx.f index b3f5e23f4..82470ead3 100644 --- a/lapack-netlib/TESTING/EIG/ddrgsx.f +++ b/lapack-netlib/TESTING/EIG/ddrgsx.f @@ -400,7 +400,7 @@ EXTERNAL DLCTSX, ILAENV, DLAMCH, DLANGE * .. * .. External Subroutines .. - EXTERNAL ALASVM, DGESVD, DGET51, DGET53, DGGESX, DLABAD, + EXTERNAL ALASVM, DGESVD, DGET51, DGET53, DGGESX, $ DLACPY, DLAKF2, DLASET, DLATM5, XERBLA * .. * .. Intrinsic Functions .. @@ -478,7 +478,6 @@ ULPINV = ONE / ULP SMLNUM = DLAMCH( 'S' ) / ULP BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) THRSH2 = TEN*THRESH NTESTT = 0 NERRS = 0 @@ -984,7 +983,7 @@ $ / ' 2: A and B are upper triangular matrices, ', $ / ' 3: A and B are as type 2, but each second diagonal ', $ 'block in A_11 and ', / - $ ' each third diaongal block in A_22 are 2x2 blocks,', + $ ' each third diagonal block in A_22 are 2x2 blocks,', $ / ' 4: A and B are block diagonal matrices, ', $ / ' 5: (A,B) has potentially close or common ', $ 'eigenvalues.', / ) diff --git a/lapack-netlib/TESTING/EIG/ddrgvx.f b/lapack-netlib/TESTING/EIG/ddrgvx.f index c63762134..3de72eb70 100644 --- a/lapack-netlib/TESTING/EIG/ddrgvx.f +++ b/lapack-netlib/TESTING/EIG/ddrgvx.f @@ -56,7 +56,7 @@ *> *> | l**H * (beta A - alpha B) | / ( ulp max( |beta A|, |alpha B| ) ) *> -*> where l**H is the conjugate tranpose of l. +*> where l**H is the conjugate transpose of l. *> *> (2) max over all right eigenvalue/-vector pairs (beta/alpha,r) of *> diff --git a/lapack-netlib/TESTING/EIG/ddrvsg.f b/lapack-netlib/TESTING/EIG/ddrvsg.f index 2e9d3c643..72c373086 100644 --- a/lapack-netlib/TESTING/EIG/ddrvsg.f +++ b/lapack-netlib/TESTING/EIG/ddrvsg.f @@ -234,7 +234,7 @@ *> *> B DOUBLE PRECISION array, dimension (LDB , max(NN)) *> Used to hold the symmetric positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -399,7 +399,7 @@ EXTERNAL LSAME, DLAMCH, DLARND * .. * .. External Subroutines .. - EXTERNAL DLABAD, DLACPY, DLAFTS, DLASET, DLASUM, DLATMR, + EXTERNAL DLACPY, DLAFTS, DLASET, DLASUM, DLATMR, $ DLATMS, DSBGV, DSBGVD, DSBGVX, DSGT01, DSPGV, $ DSPGVD, DSPGVX, DSYGV, DSYGVD, DSYGVX, XERBLA * .. @@ -460,7 +460,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/ddrvsg2stg.f b/lapack-netlib/TESTING/EIG/ddrvsg2stg.f index 196c6b48e..0fe31cab1 100644 --- a/lapack-netlib/TESTING/EIG/ddrvsg2stg.f +++ b/lapack-netlib/TESTING/EIG/ddrvsg2stg.f @@ -240,7 +240,7 @@ *> *> B DOUBLE PRECISION array, dimension (LDB , max(NN)) *> Used to hold the symmetric positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -408,7 +408,7 @@ EXTERNAL LSAME, DLAMCH, DLARND * .. * .. External Subroutines .. - EXTERNAL DLABAD, DLACPY, DLAFTS, DLASET, DLASUM, DLATMR, + EXTERNAL DLACPY, DLAFTS, DLASET, DLASUM, DLATMR, $ DLATMS, DSBGV, DSBGVD, DSBGVX, DSGT01, DSPGV, $ DSPGVD, DSPGVX, DSYGV, DSYGVD, DSYGVX, XERBLA, $ DSYGV_2STAGE @@ -470,7 +470,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/ddrvst.f b/lapack-netlib/TESTING/EIG/ddrvst.f index a25077018..805fd8271 100644 --- a/lapack-netlib/TESTING/EIG/ddrvst.f +++ b/lapack-netlib/TESTING/EIG/ddrvst.f @@ -214,7 +214,7 @@ *> Not modified. *> *> D1 DOUBLE PRECISION array, dimension (max(NN)) -*> The eigenvalues of A, as computed by DSTEQR simlutaneously +*> The eigenvalues of A, as computed by DSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -502,11 +502,11 @@ EXTERNAL DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL ALASVM, DLABAD, DLACPY, DLAFTS, DLASET, DLATMR, - $ DLATMS, DSBEV, DSBEVD, DSBEVX, DSPEV, DSPEVD, - $ DSPEVX, DSTEV, DSTEVD, DSTEVR, DSTEVX, DSTT21, - $ DSTT22, DSYEV, DSYEVD, DSYEVR, DSYEVX, DSYT21, - $ DSYT22, XERBLA + EXTERNAL ALASVM, DLACPY, DLAFTS, DLASET, DLATMR, DLATMS, + $ DSBEV, DSBEVD, DSBEVX, DSPEV, DSPEVD, DSPEVX, + $ DSTEV, DSTEVD, DSTEVR, DSTEVX, DSTT21, DSTT22, + $ DSYEV, DSYEVD, DSYEVR, DSYEVX, DSYT21, DSYT22, + $ XERBLA * .. * .. Scalars in Common .. CHARACTER*32 SRNAMT @@ -574,7 +574,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/ddrvst2stg.f b/lapack-netlib/TESTING/EIG/ddrvst2stg.f index c9a2632bb..e38671e38 100644 --- a/lapack-netlib/TESTING/EIG/ddrvst2stg.f +++ b/lapack-netlib/TESTING/EIG/ddrvst2stg.f @@ -214,7 +214,7 @@ *> Not modified. *> *> D1 DOUBLE PRECISION array, dimension (max(NN)) -*> The eigenvalues of A, as computed by DSTEQR simlutaneously +*> The eigenvalues of A, as computed by DSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -502,7 +502,7 @@ EXTERNAL DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL ALASVM, DLABAD, DLACPY, DLAFTS, DLASET, DLATMR, + EXTERNAL ALASVM, DLACPY, DLAFTS, DLASET, DLATMR, $ DLATMS, DSBEV, DSBEVD, DSBEVX, DSPEV, DSPEVD, $ DSPEVX, DSTEV, DSTEVD, DSTEVR, DSTEVX, DSTT21, $ DSTT22, DSYEV, DSYEVD, DSYEVR, DSYEVX, DSYT21, @@ -577,7 +577,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/derrhs.f b/lapack-netlib/TESTING/EIG/derrhs.f index fec41c0f0..583bebc9b 100644 --- a/lapack-netlib/TESTING/EIG/derrhs.f +++ b/lapack-netlib/TESTING/EIG/derrhs.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> DERRHS tests the error exits for DGEBAK, SGEBAL, SGEHRD, DORGHR, -*> DORMHR, DHSEQR, SHSEIN, and DTREVC. +*> DERRHS tests the error exits for DGEBAK, DGEBAL, DGEHRD, DGEHD2, +*> DORGHR, DORMHR, DHSEQR, DHSEIN, DTREVC, and DTREVC3. *> \endverbatim * * Arguments: @@ -86,7 +86,7 @@ * .. * .. External Subroutines .. EXTERNAL CHKXER, DGEBAK, DGEBAL, DGEHRD, DHSEIN, DHSEQR, - $ DORGHR, DORMHR, DTREVC + $ DORGHR, DORMHR, DTREVC, DTREVC3, DGEHD2 * .. * .. Intrinsic Functions .. INTRINSIC DBLE @@ -194,6 +194,29 @@ CALL CHKXER( 'DGEHRD', INFOT, NOUT, LERR, OK ) NT = NT + 7 * +* DGEHD2 +* + SRNAMT = 'DGEHD2' + INFOT = 1 + CALL DGEHD2( -1, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGEHD2( 0, 0, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGEHD2( 0, 2, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGEHD2( 1, 1, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGEHD2( 0, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DGEHD2( 2, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK ) + NT = NT + 6 +* * DORGHR * SRNAMT = 'DORGHR' @@ -328,7 +351,11 @@ CALL DHSEQR( 'E', 'V', 2, 1, 2, A, 2, WR, WI, C, 1, W, 1, $ INFO ) CALL CHKXER( 'DHSEQR', INFOT, NOUT, LERR, OK ) - NT = NT + 9 + INFOT = 13 + CALL DHSEQR( 'E', 'N', 2, 1, 2, A, 2, WR, WI, C, 1, W, 1, + $ INFO ) + CALL CHKXER( 'DHSEQR', INFOT, NOUT, LERR, OK ) + NT = NT + 10 * * DHSEIN * @@ -399,6 +426,43 @@ $ INFO ) CALL CHKXER( 'DTREVC', INFOT, NOUT, LERR, OK ) NT = NT + 7 +* +* DTREVC3 +* + SRNAMT = 'DTREVC3' + INFOT = 1 + CALL DTREVC3( '/', 'A', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTREVC3( 'L', '/', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTREVC3( 'L', 'A', SEL, -1, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTREVC3( 'L', 'A', SEL, 2, A, 1, VL, 2, VR, 1, 4, M, W, + $ LW, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DTREVC3( 'R', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 1, M, W, + $ LW, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL DTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W, + $ 2, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 END IF * * Print a summary line. diff --git a/lapack-netlib/TESTING/EIG/derrst.f b/lapack-netlib/TESTING/EIG/derrst.f index f297e5a7d..7d111e2e0 100644 --- a/lapack-netlib/TESTING/EIG/derrst.f +++ b/lapack-netlib/TESTING/EIG/derrst.f @@ -21,10 +21,10 @@ *> *> \verbatim *> -*> DERRST tests the error exits for DSYTRD, DORGTR, DORMTR, DSPTRD, -*> DOPGTR, DOPMTR, DSTEQR, SSTERF, SSTEBZ, SSTEIN, DPTEQR, DSBTRD, -*> DSYEV, SSYEVX, SSYEVD, DSBEV, SSBEVX, SSBEVD, -*> DSPEV, SSPEVX, SSPEVD, DSTEV, SSTEVX, SSTEVD, and SSTEDC. +*> DERRST tests the error exits for DSYTRD, DSYTD2, DORGTR, DORMTR, DSPTRD, +*> DOPGTR, DOPMTR, DSTEQR, DSTERF, DSTEBZ, DSTEIN, DPTEQR, DSBTRD, +*> DSYEV, DSYEVX, DSYEVD, DSBEV, DSBEVX, DSBEVD, +*> DSPEV, DSPEVX, DSPEVD, DSTEV, DSTEVX, DSTEVD, and DSTEDC. *> DSYEVD_2STAGE, DSYEVR_2STAGE, DSYEVX_2STAGE, *> DSYEV_2STAGE, DSBEV_2STAGE, DSBEVD_2STAGE, *> DSBEVX_2STAGE, DSYTRD_2STAGE, DSYTRD_SY2SB, @@ -95,7 +95,7 @@ $ DSBEV, DSBEVD, DSBEVX, DSBTRD, DSPEV, DSPEVD, $ DSPEVX, DSPTRD, DSTEBZ, DSTEDC, DSTEIN, DSTEQR, $ DSTERF, DSTEV, DSTEVD, DSTEVR, DSTEVX, DSYEV, - $ DSYEVD, DSYEVR, DSYEVX, DSYTRD, + $ DSYEVD, DSYEVR, DSYEVX, DSYTRD, DSYTD2, $ DSYEVD_2STAGE, DSYEVR_2STAGE, DSYEVX_2STAGE, $ DSYEV_2STAGE, DSBEV_2STAGE, DSBEVD_2STAGE, $ DSBEVX_2STAGE, DSYTRD_2STAGE, DSYTRD_SY2SB, @@ -157,6 +157,20 @@ CALL CHKXER( 'DSYTRD', INFOT, NOUT, LERR, OK ) NT = NT + 4 * +* DSYTD2 +* + SRNAMT = 'DSYTD2' + INFOT = 1 + CALL DSYTD2( '/', 0, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSYTD2( 'U', -1, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYTD2( 'U', 2, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK ) + NT = NT + 3 +* * DSYTRD_2STAGE * SRNAMT = 'DSYTRD_2STAGE' @@ -721,12 +735,12 @@ CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL DSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N-1, IW( 2*N+1 ), 10*N, + $ M, R, Z, 1, IW, Q, 0, IW( 2*N+1 ), 10*N, $ INFO ) CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL DSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 10*N-1, + $ M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 0, $ INFO ) CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK ) NT = NT + 11 diff --git a/lapack-netlib/TESTING/EIG/dsyl01.f b/lapack-netlib/TESTING/EIG/dsyl01.f index 782d2cd42..0ea481382 100644 --- a/lapack-netlib/TESTING/EIG/dsyl01.f +++ b/lapack-netlib/TESTING/EIG/dsyl01.f @@ -117,13 +117,15 @@ $ SCALE, SCALE3, SMLNUM, TNRM, XNRM * .. * .. Local Arrays .. - DOUBLE PRECISION A( MAXM, MAXM ), B( MAXN, MAXN ), - $ C( MAXM, MAXN ), CC( MAXM, MAXN ), - $ X( MAXM, MAXN ), - $ DUML( MAXM ), DUMR( MAXN ), + DOUBLE PRECISION DUML( MAXM ), DUMR( MAXN ), $ D( MAX( MAXM, MAXN ) ), DUM( MAXN ), - $ SWORK( LDSWORK, 126 ), VM( 2 ) - INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 ) + $ VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, C, CC, X, + $ SWORK * .. * .. External Functions .. LOGICAL DISNAN @@ -136,6 +138,20 @@ * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX * .. +* .. Allocate memory dynamically .. + ALLOCATE ( A( MAXM, MAXM ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( B( MAXN, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( C( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( CC( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( X( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( SWORK( LDSWORK, 126 ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. * * Get machine parameters @@ -280,6 +296,13 @@ END DO END DO END DO +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (CC, STAT = AllocateStatus) + DEALLOCATE (X, STAT = AllocateStatus) + DEALLOCATE (SWORK, STAT = AllocateStatus) * RETURN * diff --git a/lapack-netlib/TESTING/EIG/schkdmd.f90 b/lapack-netlib/TESTING/EIG/schkdmd.f90 new file mode 100644 index 000000000..77e3e46c0 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/schkdmd.f90 @@ -0,0 +1,792 @@ +! This is a test program for checking the implementations of +! the implementations of the following subroutines +! +! SGEDMD for computation of the +! Dynamic Mode Decomposition (DMD) +! SGEDMDQ for computation of a +! QR factorization based compressed DMD +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! ======================================================== +! How to run the code (compiler, link info) +! ======================================================== +! Compile as FORTRAN 90 (or later) and link with BLAS and +! LAPACK libraries. +! NOTE: The code is developed and tested on top of the +! Intel MKL library (versions 2022.0.3 and 2022.2.0), +! using the Intel Fortran compiler. +! +! For developers of the C++ implementation +! ======================================================== +! See the LAPACK++ and Template Numerical Toolkit (TNT) +! +! Note on a development of the GPU HP implementation +! ======================================================== +! Work in progress. See CUDA, MAGMA, SLATE. +! NOTE: The four SVD subroutines used in this code are +! included as a part of R&D and for the completeness. +! This was also an opportunity to test those SVD codes. +! If the scaling option is used all four are essentially +! equally good. For implementations on HP platforms, +! one can use whichever SVD is available. +!... ......................................................... +! NOTE: +! When using the Intel MKL 2022.0.3 the subroutine xGESVDQ +! (optionally used in xGEDMD) may cause access violation +! error for x = S, D, C, Z, but only if called with the +! work space query. (At least in our Windows 10 MSVS 2019.) +! The problem can be mitigated by downloading the source +! code of xGESVDQ from the LAPACK repository and use it +! localy instead of the one in the MKL. This seems to +! indicate that the problem is indeed in the MKL. +! This problem did not appear whith Intel MKL 2022.2.0. +! +! NOTE: +! xGESDD seems to have a problem with workspace. In some +! cases the length of the optimal workspace is returned +! smaller than the minimal workspace, as specified in the +! code. As a precaution, all optimal workspaces are +! set as MAX(minimal, optimal). +! Latest implementations of complex xGESDD have different +! length of the real worksapce. We use max value over +! two versions. +!............................................................ +!............................................................ +! + PROGRAM DMD_TEST + use iso_fortran_env, only: real32 + IMPLICIT NONE + integer, parameter :: WP = real32 + +!............................................................ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP +!............................................................ + REAL(KIND=WP), ALLOCATABLE, DIMENSION(:,:) :: & + A, AC, EIGA, LAMBDA, LAMBDAQ, F, F1, F2,& + Z, Z1, S, AU, W, VA, X, X0, Y, Y0, Y1 + REAL(KIND=WP), ALLOCATABLE, DIMENSION(:) :: & + DA, DL, DR, REIG, REIGA, REIGQ, IEIG, & + IEIGA, IEIGQ, RES, RES1, RESEX, SINGVX,& + SINGVQX, WORK + INTEGER , ALLOCATABLE, DIMENSION(:) :: IWORK + REAL(KIND=WP) :: AB(2,2), WDUMMY(2) + INTEGER :: IDUMMY(2), ISEED(4), RJOBDATA(8) + REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, DMAX, EPS, & + TOL, TOL2, SVDIFF, TMP, TMP_AU, & + TMP_FQR, TMP_REZ, TMP_REZQ, TMP_ZXW, & + TMP_EX, XNORM, YNORM +!............................................................ + INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY, & + LDZ, LIWORK, LWORK, M, N, L, LLOOP, NRNK + INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, KDIFF, & + NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ, & + NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL, & + NFAIL_Z_XV, MODE, MODEL, MODER, WHTSVD + INTEGER iNRNK, iWHTSVD, K_TRAJ, LWMINOPT + CHARACTER(LEN=1) GRADE, JOBREF, JOBZ, PIVTNG, RSIGN, & + SCALE, RESIDS, WANTQ, WANTR + + LOGICAL TEST_QRDMD +!..... external subroutines (BLAS and LAPACK) + EXTERNAL SAXPY, SGEEV, SGEMM, SGEMV, SLACPY, SLASCL + EXTERNAL SLARNV, SLATMR +!.....external subroutines DMD package, part 1 +! subroutines under test + EXTERNAL SGEDMD, SGEDMDQ + +!..... external functions (BLAS and LAPACK) + EXTERNAL SLAMCH, SLANGE, SNRM2 + REAL(KIND=WP) :: SLAMCH, SLANGE, SNRM2 + EXTERNAL LSAME + LOGICAL LSAME + + INTRINSIC ABS, INT, MIN, MAX +!............................................................ + + ! The test is always in pairs : ( SGEDMD and SGEDMDQ ) + ! because the test includes comparing the results (in pairs). +!..................................................................................... + TEST_QRDMD = .TRUE. ! This code by default performs tests on SGEDMDQ + ! Since the QR factorizations based algorithm is designed for + ! single trajectory data, only single trajectory tests will + ! be performed with xGEDMDQ. + WANTQ = 'Q' + WANTR = 'R' +!................................................................................. + + EPS = SLAMCH( 'P' ) ! machine precision SP + + ! Global counters of failures of some particular tests + NFAIL = 0 + NFAIL_REZ = 0 + NFAIL_REZQ = 0 + NFAIL_Z_XV = 0 + NFAIL_F_QR = 0 + NFAIL_AU = 0 + KDIFF = 0 + NFAIL_SVDIFF = 0 + NFAIL_TOTAL = 0 + NFAILQ_TOTAL = 0 + + + DO LLOOP = 1, 4 + + WRITE(*,*) 'L Loop Index = ', LLOOP + + ! Set the dimensions of the problem ... + WRITE(*,*) 'M = ' + READ(*,*) M + WRITE(*,*) M + ! ... and the number of snapshots. + WRITE(*,*) 'N = ' + READ(*,*) N + WRITE(*,*) N + + ! ... Test the dimensions + IF ( ( MIN(M,N) == 0 ) .OR. ( M < N ) ) THEN + WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.' + STOP + END IF +!............. + ! The seed inside the LLOOP so that each pass can be reproduced easily. + + ISEED(1) = 4 + ISEED(2) = 3 + ISEED(3) = 2 + ISEED(4) = 1 + + LDA = M + LDF = M + LDX = MAX(M,N+1) + LDY = MAX(M,N+1) + LDW = N + LDZ = M + LDAU = MAX(M,N+1) + LDS = N + + TMP_ZXW = ZERO + TMP_AU = ZERO + TMP_REZ = ZERO + TMP_REZQ = ZERO + SVDIFF = ZERO + TMP_EX = ZERO + + ! + ! Test the subroutines on real data snapshots. All + ! computation is done in real arithmetic, even when + ! Koopman eigenvalues and modes are real. + ! + ! Allocate memory space + ALLOCATE( A(LDA,M) ) + ALLOCATE( AC(LDA,M) ) + ALLOCATE( DA(M) ) + ALLOCATE( DL(M) ) + ALLOCATE( F(LDF,N+1) ) + ALLOCATE( F1(LDF,N+1) ) + ALLOCATE( F2(LDF,N+1) ) + ALLOCATE( X(LDX,N) ) + ALLOCATE( X0(LDX,N) ) + ALLOCATE( SINGVX(N) ) + ALLOCATE( SINGVQX(N) ) + ALLOCATE( Y(LDY,N+1) ) + ALLOCATE( Y0(LDY,N+1) ) + ALLOCATE( Y1(M,N+1) ) + ALLOCATE( Z(LDZ,N) ) + ALLOCATE( Z1(LDZ,N) ) + ALLOCATE( RES(N) ) + ALLOCATE( RES1(N) ) + ALLOCATE( RESEX(N) ) + ALLOCATE( REIG(N) ) + ALLOCATE( IEIG(N) ) + ALLOCATE( REIGQ(N) ) + ALLOCATE( IEIGQ(N) ) + ALLOCATE( REIGA(M) ) + ALLOCATE( IEIGA(M) ) + ALLOCATE( VA(LDA,M) ) + ALLOCATE( LAMBDA(N,2) ) + ALLOCATE( LAMBDAQ(N,2) ) + ALLOCATE( EIGA(M,2) ) + ALLOCATE( W(LDW,N) ) + ALLOCATE( AU(LDAU,N) ) + ALLOCATE( S(N,N) ) + + TOL = M*EPS + ! This mimics O(M*N)*EPS bound for accumulated roundoff error. + ! The factor 10 is somewhat arbitrary. + TOL2 = 10*M*N*EPS + +!............. + + DO K_TRAJ = 1, 2 + ! Number of intial conditions in the simulation/trajectories (1 or 2) + + COND = 1.0D8 + DMAX = 1.0D2 + RSIGN = 'F' + GRADE = 'N' + MODEL = 6 + CONDL = 1.0D2 + MODER = 6 + CONDR = 1.0D2 + PIVTNG = 'N' + + ! Loop over all parameter MODE values for ZLATMR (+1,..,+6) + DO MODE = 1, 6 + + ALLOCATE( IWORK(2*M) ) + ALLOCATE(DR(N)) + CALL SLATMR( M, M, 'S', ISEED, 'N', DA, MODE, COND, & + DMAX, RSIGN, GRADE, DL, MODEL, CONDL, & + DR, MODER, CONDR, PIVTNG, IWORK, M, M, & + ZERO, -ONE, 'N', A, LDA, IWORK(M+1), INFO ) + DEALLOCATE(IWORK) + DEALLOCATE(DR) + + LWORK = 4*M+1 + ALLOCATE(WORK(LWORK)) + AC = A + CALL SGEEV( 'N','V', M, AC, M, REIGA, IEIGA, VA, M, & + VA, M, WORK, LWORK, INFO ) ! LAPACK CALL + DEALLOCATE(WORK) + TMP = ZERO + DO i = 1, M + EIGA(i,1) = REIGA(i) + EIGA(i,2) = IEIGA(i) + TMP = MAX( TMP, SQRT(REIGA(i)**2+IEIGA(i)**2)) + END DO + + ! Scale A to have the desirable spectral radius. + CALL SLASCL( 'G', 0, 0, TMP, ONE, M, M, A, M, INFO ) + CALL SLASCL( 'G', 0, 0, TMP, ONE, M, 2, EIGA, M, INFO ) + + ! Compute the norm of A + ANORM = SLANGE( 'F', N, N, A, M, WDUMMY ) + + IF ( K_TRAJ == 2 ) THEN + ! generate data with two inital conditions + CALL SLARNV(2, ISEED, M, F1(1,1) ) + F1(1:M,1) = 1.0E-10*F1(1:M,1) + DO i = 1, N/2 + CALL SGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, & + F1(1,i+1), 1 ) + END DO + X0(1:M,1:N/2) = F1(1:M,1:N/2) + Y0(1:M,1:N/2) = F1(1:M,2:N/2+1) + + CALL SLARNV(2, ISEED, M, F1(1,1) ) + DO i = 1, N-N/2 + CALL SGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, & + F1(1,i+1), 1 ) + END DO + X0(1:M,N/2+1:N) = F1(1:M,1:N-N/2) + Y0(1:M,N/2+1:N) = F1(1:M,2:N-N/2+1) + ELSE + ! single trajectory + CALL SLARNV(2, ISEED, M, F(1,1) ) + DO i = 1, N + CALL SGEMV( 'N', M, M, ONE, A, M, F(1,i), 1, ZERO, & + F(1,i+1), 1 ) + END DO + X0(1:M,1:N) = F(1:M,1:N) + Y0(1:M,1:N) = F(1:M,2:N+1) + END IF + + XNORM = SLANGE( 'F', M, N, X0, LDX, WDUMMY ) + YNORM = SLANGE( 'F', M, N, Y0, LDX, WDUMMY ) +!............................................................ + + DO iJOBZ = 1, 4 + + SELECT CASE ( iJOBZ ) + CASE(1) + JOBZ = 'V' ! Ritz vectors will be computed + RESIDS = 'R' ! Residuals will be computed + CASE(2) + JOBZ = 'V' + RESIDS = 'N' + CASE(3) + JOBZ = 'F' ! Ritz vectors in factored form + RESIDS = 'N' + CASE(4) + JOBZ = 'N' + RESIDS = 'N' + END SELECT + + DO iJOBREF = 1, 3 + + SELECT CASE ( iJOBREF ) + CASE(1) + JOBREF = 'R' ! Data for refined Ritz vectors + CASE(2) + JOBREF = 'E' ! Exact DMD vectors + CASE(3) + JOBREF = 'N' + END SELECT + + DO iSCALE = 1, 4 + + SELECT CASE ( iSCALE ) + CASE(1) + SCALE = 'S' ! X data normalized + CASE(2) + SCALE = 'C' ! X normalized, consist. check + CASE(3) + SCALE = 'Y' ! Y data normalized + CASE(4) + SCALE = 'N' + END SELECT + + DO iNRNK = -1, -2, -1 + ! Two truncation strategies. The "-2" case for R&D + ! purposes only - it uses possibly low accuracy small + ! singular values, in which case the formulas used in + ! the DMD are highly sensitive. + NRNK = iNRNK + + DO iWHTSVD = 1, 4 + ! Check all four options to compute the POD basis + ! via the SVD. + WHTSVD = iWHTSVD + + DO LWMINOPT = 1, 2 + ! Workspace query for the minimal (1) and for the optimal + ! (2) workspace lengths determined by workspace query. + + X(1:M,1:N) = X0(1:M,1:N) + Y(1:M,1:N) = Y0(1:M,1:N) + + ! SGEDMD: Workspace query and workspace allocation + CALL SGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, & + N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, & + LDZ, RES, AU, LDAU, W, LDW, S, LDS, WDUMMY, -1, & + IDUMMY, -1, INFO ) + + LIWORK = IDUMMY(1) + ALLOCATE( IWORK(LIWORK) ) + LWORK = INT(WDUMMY(LWMINOPT)) + ALLOCATE( WORK(LWORK) ) + + ! SGEDMD test: CALL SGEDMD + CALL SGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, & + N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, & + LDZ, RES, AU, LDAU, W, LDW, S, LDS, WORK, LWORK,& + IWORK, LIWORK, INFO ) + + SINGVX(1:N) = WORK(1:N) + + !...... SGEDMD check point + IF ( LSAME(JOBZ,'V') ) THEN + ! Check that Z = X*W, on return from SGEDMD + ! This checks that the returned aigenvectors in Z are + ! the product of the SVD'POD basis returned in X + ! and the eigenvectors of the rayleigh quotient + ! returned in W + CALL SGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, & + ZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL SAXPY( M, -ONE, Z(1,i), 1, Z1(1,i), 1) + TMP = MAX(TMP, SNRM2( M, Z1(1,i), 1 ) ) + END DO + TMP_ZXW = MAX(TMP_ZXW, TMP ) + + IF ( TMP_ZXW > 10*M*EPS ) THEN + NFAIL_Z_XV = NFAIL_Z_XV + 1 + END IF + + END IF + + !...... SGEDMD check point + IF ( LSAME(JOBREF,'R') ) THEN + ! The matrix A*U is returned for computing refined Ritz vectors. + ! Check that A*U is computed correctly using the formula + ! A*U = Y * V * inv(SIGMA). This depends on the + ! accuracy in the computed singular values and vectors of X. + ! See the paper for an error analysis. + ! Note that the left singular vectors of the input matrix X + ! are returned in the array X. + CALL SGEMM( 'N', 'N', M, K, M, ONE, A, LDA, X, LDX, & + ZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL SAXPY( M, -ONE, AU(1,i), 1, Z1(1,i), 1) + TMP = MAX( TMP, SNRM2( M, Z1(1,i),1 ) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_AU = MAX( TMP_AU, TMP ) + + IF ( TMP > TOL2 ) THEN + NFAIL_AU = NFAIL_AU + 1 + END IF + + ELSEIF ( LSAME(JOBREF,'E') ) THEN + ! The unscaled vectors of the Exact DMD are computed. + ! This option is included for the sake of completeness, + ! for users who prefer the Exact DMD vectors. The + ! returned vectors are in the real form, in the same way + ! as the Ritz vectors. Here we just save the vectors + ! and test them separately using a Matlab script. + + CALL SGEMM( 'N', 'N', M, K, M, ONE, A, LDA, AU, LDAU, ZERO, Y1, M ) + i=1 + DO WHILE ( i <= K ) + IF ( IEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL SAXPY( M, -REIG(i), AU(1,i), 1, Y1(1,i), 1 ) + RESEX(i) = SNRM2( M, Y1(1,i), 1) / SNRM2(M,AU(1,i),1) + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IEIG(i) + AB(1,2) = IEIG(i) + AB(2,2) = REIG(i) + CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, AU(1,i), & + M, AB, 2, ONE, Y1(1,i), M ) + RESEX(i) = SLANGE( 'F', M, 2, Y1(1,i), M, & + WORK )/ SLANGE( 'F', M, 2, AU(1,i), M, & + WORK ) + RESEX(i+1) = RESEX(i) + i = i + 2 + END IF + END DO + + END IF + + !...... SGEDMD check point + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by SGEDMD with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL SGEMM( 'N', 'N', M, K, M, ONE, A, LDA, Z, LDZ, ZERO, Y1, M ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in SGEDMD,) + i = 1 + DO WHILE ( i <= K ) + IF ( IEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL SAXPY( M, -REIG(i), Z(1,i), 1, Y1(1,i), 1 ) + RES1(i) = SNRM2( M, Y1(1,i), 1) + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IEIG(i) + AB(1,2) = IEIG(i) + AB(2,2) = REIG(i) + CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), & + M, AB, 2, ONE, Y1(1,i), M ) + RES1(i) = SLANGE( 'F', M, 2, Y1(1,i), M, & + WORK ) + RES1(i+1) = RES1(i) + i = i + 2 + END IF + END DO + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_REZ = MAX( TMP_REZ, TMP ) + + IF ( TMP > TOL2 ) THEN + NFAIL_REZ = NFAIL_REZ + 1 + END IF + + IF ( LSAME(JOBREF,'E') ) THEN + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) ) + END DO + TMP_EX = MAX(TMP_EX,TMP) + END IF + + END IF + + ! ... store the results for inspection + DO i = 1, K + LAMBDA(i,1) = REIG(i) + LAMBDA(i,2) = IEIG(i) + END DO + + DEALLOCATE(IWORK) + DEALLOCATE(WORK) + + !====================================================================== + ! Now test the SGEDMDQ, if requested. + !====================================================================== + IF ( TEST_QRDMD .AND. (K_TRAJ == 1) ) THEN + RJOBDATA(2) = 1 + F1 = F + + ! SGEDMDQ test: Workspace query and workspace allocation + CALL SGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, & + JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, & + LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ, & + RES, AU, LDAU, W, LDW, S, LDS, WDUMMY, & + -1, IDUMMY, -1, INFO ) + LIWORK = IDUMMY(1) + ALLOCATE( IWORK(LIWORK) ) + LWORK = INT(WDUMMY(LWMINOPT)) + ALLOCATE(WORK(LWORK)) + + ! SGEDMDQ test: CALL SGEDMDQ + CALL SGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, & + JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, & + LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ, & + RES, AU, LDAU, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) + + SINGVQX(1:KQ) = WORK(MIN(M,N+1)+1: MIN(M,N+1)+KQ) + + !..... SGEDMDQ check point + IF ( KQ /= K ) THEN + KDIFF = KDIFF+1 + END IF + + TMP = ZERO + DO i = 1, MIN(K, KQ) + TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / & + SINGVX(1) ) + END DO + SVDIFF = MAX( SVDIFF, TMP ) + IF ( TMP > M*N*EPS ) THEN + NFAIL_SVDIFF = NFAIL_SVDIFF + 1 + END IF + + !..... SGEDMDQ check point + IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN + ! Check that the QR factors are computed and returned + ! as requested. The residual ||F-Q*R||_F / ||F||_F + ! is compared to M*N*EPS. + F2 = F + CALL SGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -ONE, F1, & + LDF, Y, LDY, ONE, F2, LDF ) + TMP_FQR = SLANGE( 'F', M, N+1, F2, LDF, WORK ) / & + SLANGE( 'F', M, N+1, F, LDF, WORK ) + IF ( TMP_FQR > TOL2 ) THEN + NFAIL_F_QR = NFAIL_F_QR + 1 + END IF + END IF + + !..... SGEDMDQ checkpoint + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by SGEDMDQ with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL SGEMM( 'N', 'N', M, KQ, M, ONE, A, M, Z, M, ZERO, Y1, M ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in SGEDMDQ) + i = 1 + DO WHILE ( i <= KQ ) + IF ( IEIGQ(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL SAXPY( M, -REIGQ(i), Z(1,i), 1, Y1(1,i), 1 ) + ! Y(1:M,i) = Y(1:M,i) - REIG(i)*Z(1:M,i) + RES1(i) = SNRM2( M, Y1(1,i), 1) + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIGQ(i) + AB(2,1) = -IEIGQ(i) + AB(1,2) = IEIGQ(i) + AB(2,2) = REIGQ(i) + CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), & + M, AB, 2, ONE, Y1(1,i), M ) ! BLAS CALL + ! Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB ! INTRINSIC + RES1(i) = SLANGE( 'F', M, 2, Y1(1,i), M, & + WORK ) ! LAPACK CALL + RES1(i+1) = RES1(i) + i = i + 2 + END IF + END DO + TMP = ZERO + DO i = 1, KQ + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVQX(K)/(ANORM*SINGVQX(1)) ) + END DO + TMP_REZQ = MAX( TMP_REZQ, TMP ) + IF ( TMP > TOL2 ) THEN + NFAIL_REZQ = NFAIL_REZQ + 1 + END IF + + END IF + + DO i = 1, KQ + LAMBDAQ(i,1) = REIGQ(i) + LAMBDAQ(i,2) = IEIGQ(i) + END DO + + DEALLOCATE(WORK) + DEALLOCATE(IWORK) + END IF ! TEST_QRDMD +!====================================================================== + + END DO ! LWMINOPT + !write(*,*) 'LWMINOPT loop completed' + END DO ! WHTSVD LOOP + !write(*,*) 'WHTSVD loop completed' + END DO ! NRNK LOOP + !write(*,*) 'NRNK loop completed' + END DO ! SCALE LOOP + !write(*,*) 'SCALE loop completed' + END DO ! JOBF LOOP + !write(*,*) 'JOBREF loop completed' + END DO ! JOBZ LOOP + !write(*,*) 'JOBZ loop completed' + + END DO ! MODE -6:6 + !write(*,*) 'MODE loop completed' + END DO ! 1 or 2 trajectories + !write(*,*) 'trajectories loop completed' + + DEALLOCATE(A) + DEALLOCATE(AC) + DEALLOCATE(DA) + DEALLOCATE(DL) + DEALLOCATE(F) + DEALLOCATE(F1) + DEALLOCATE(F2) + DEALLOCATE(X) + DEALLOCATE(X0) + DEALLOCATE(SINGVX) + DEALLOCATE(SINGVQX) + DEALLOCATE(Y) + DEALLOCATE(Y0) + DEALLOCATE(Y1) + DEALLOCATE(Z) + DEALLOCATE(Z1) + DEALLOCATE(RES) + DEALLOCATE(RES1) + DEALLOCATE(RESEX) + DEALLOCATE(REIG) + DEALLOCATE(IEIG) + DEALLOCATE(REIGQ) + DEALLOCATE(IEIGQ) + DEALLOCATE(REIGA) + DEALLOCATE(IEIGA) + DEALLOCATE(VA) + DEALLOCATE(LAMBDA) + DEALLOCATE(LAMBDAQ) + DEALLOCATE(EIGA) + DEALLOCATE(W) + DEALLOCATE(AU) + DEALLOCATE(S) + +!............................................................ + ! Generate random M-by-M matrix A. Use DLATMR from + END DO ! LLOOP + + + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for SGEDMD :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + IF ( NFAIL_Z_XV == 0 ) THEN + WRITE(*,*) '>>>> Z - U*V test PASSED.' + ELSE + WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)' + WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_ZXW + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_Z_XV + END IF + IF ( NFAIL_AU == 0 ) THEN + WRITE(*,*) '>>>> A*U test PASSED. ' + ELSE + WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)' + WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU + END IF + + IF ( NFAIL_REZ == 0 ) THEN + WRITE(*,*) '>>>> Rezidual computation test PASSED.' + ELSE + WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZ, 'time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ + END IF + + IF ( NFAIL_TOTAL == 0 ) THEN + WRITE(*,*) '>>>> SGEDMD :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAIL_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>>>>>>>>> SGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + IF ( TEST_QRDMD ) THEN + WRITE(*,*) + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for SGEDMDQ :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + + IF ( NFAIL_SVDIFF == 0 ) THEN + WRITE(*,*) '>>>> SGEDMD and SGEDMDQ computed singular & + &values test PASSED.' + ELSE + WRITE(*,*) 'SGEDMD and SGEDMDQ discrepancies in & + &the singular values unacceptable ', & + NFAIL_SVDIFF, ' times. Test FAILED.' + WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF + END IF + + IF ( NFAIL_F_QR == 0 ) THEN + WRITE(*,*) '>>>> F - Q*R test PASSED.' + ELSE + WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)' + WRITE(*,*) 'The largest relative residual was ', TMP_FQR + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR + END IF + + IF ( NFAIL_REZQ == 0 ) THEN + WRITE(*,*) '>>>> Rezidual computation test PASSED.' + ELSE + WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZQ, 'time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ + END IF + + IF ( NFAILQ_TOTAL == 0 ) THEN + WRITE(*,*) '>>>>>>> SGEDMDQ :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>> SGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + END IF + + WRITE(*,*) + WRITE(*,*) 'Test completed.' + STOP + END diff --git a/lapack-netlib/TESTING/EIG/schkst.f b/lapack-netlib/TESTING/EIG/schkst.f index 10622d77a..aecbdfe93 100644 --- a/lapack-netlib/TESTING/EIG/schkst.f +++ b/lapack-netlib/TESTING/EIG/schkst.f @@ -363,7 +363,7 @@ *> \verbatim *> D1 is REAL array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by SSTEQR simlutaneously +*> The eigenvalues of A, as computed by SSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -645,10 +645,10 @@ EXTERNAL ILAENV, SLAMCH, SLARND, SSXT1 * .. * .. External Subroutines .. - EXTERNAL SCOPY, SLABAD, SLACPY, SLASET, SLASUM, SLATMR, - $ SLATMS, SOPGTR, SORGTR, SPTEQR, SSPT21, SSPTRD, - $ SSTEBZ, SSTECH, SSTEDC, SSTEMR, SSTEIN, SSTEQR, - $ SSTERF, SSTT21, SSTT22, SSYT21, SSYTRD, XERBLA + EXTERNAL SCOPY, SLACPY, SLASET, SLASUM, SLATMR, SLATMS, + $ SOPGTR, SORGTR, SPTEQR, SSPT21, SSPTRD, SSTEBZ, + $ SSTECH, SSTEDC, SSTEMR, SSTEIN, SSTEQR, SSTERF, + $ SSTT21, SSTT22, SSYT21, SSYTRD, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, INT, LOG, MAX, MIN, REAL, SQRT @@ -715,7 +715,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/schkst2stg.f b/lapack-netlib/TESTING/EIG/schkst2stg.f index ac5a3fc39..49899a660 100644 --- a/lapack-netlib/TESTING/EIG/schkst2stg.f +++ b/lapack-netlib/TESTING/EIG/schkst2stg.f @@ -384,7 +384,7 @@ *> \verbatim *> D1 is REAL array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by SSTEQR simlutaneously +*> The eigenvalues of A, as computed by SSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -666,10 +666,10 @@ EXTERNAL ILAENV, SLAMCH, SLARND, SSXT1 * .. * .. External Subroutines .. - EXTERNAL SCOPY, SLABAD, SLACPY, SLASET, SLASUM, SLATMR, - $ SLATMS, SOPGTR, SORGTR, SPTEQR, SSPT21, SSPTRD, - $ SSTEBZ, SSTECH, SSTEDC, SSTEMR, SSTEIN, SSTEQR, - $ SSTERF, SSTT21, SSTT22, SSYT21, SSYTRD, XERBLA, + EXTERNAL SCOPY, SLACPY, SLASET, SLASUM, SLATMR, SLATMS, + $ SOPGTR, SORGTR, SPTEQR, SSPT21, SSPTRD, SSTEBZ, + $ SSTECH, SSTEDC, SSTEMR, SSTEIN, SSTEQR, SSTERF, + $ SSTT21, SSTT22, SSYT21, SSYTRD, XERBLA, $ SSYTRD_2STAGE * .. * .. Intrinsic Functions .. @@ -737,7 +737,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/sdrgsx.f b/lapack-netlib/TESTING/EIG/sdrgsx.f index ce0d59214..2015cac1c 100644 --- a/lapack-netlib/TESTING/EIG/sdrgsx.f +++ b/lapack-netlib/TESTING/EIG/sdrgsx.f @@ -400,7 +400,7 @@ EXTERNAL SLCTSX, ILAENV, SLAMCH, SLANGE * .. * .. External Subroutines .. - EXTERNAL ALASVM, SGESVD, SGET51, SGET53, SGGESX, SLABAD, + EXTERNAL ALASVM, SGESVD, SGET51, SGET53, SGGESX, $ SLACPY, SLAKF2, SLASET, SLATM5, XERBLA * .. * .. Intrinsic Functions .. @@ -479,7 +479,6 @@ c MINWRK = MAX( 10*( NSIZE+1 ), 5*NSIZE*NSIZE / 2-2 ) ULPINV = ONE / ULP SMLNUM = SLAMCH( 'S' ) / ULP BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) THRSH2 = TEN*THRESH NTESTT = 0 NERRS = 0 @@ -985,7 +984,7 @@ c MINWRK = MAX( 10*( NSIZE+1 ), 5*NSIZE*NSIZE / 2-2 ) $ / ' 2: A and B are upper triangular matrices, ', $ / ' 3: A and B are as type 2, but each second diagonal ', $ 'block in A_11 and ', / - $ ' each third diaongal block in A_22 are 2x2 blocks,', + $ ' each third diagonal block in A_22 are 2x2 blocks,', $ / ' 4: A and B are block diagonal matrices, ', $ / ' 5: (A,B) has potentially close or common ', $ 'eigenvalues.', / ) diff --git a/lapack-netlib/TESTING/EIG/sdrgvx.f b/lapack-netlib/TESTING/EIG/sdrgvx.f index f6da720ad..91803f60a 100644 --- a/lapack-netlib/TESTING/EIG/sdrgvx.f +++ b/lapack-netlib/TESTING/EIG/sdrgvx.f @@ -57,7 +57,7 @@ *> *> | l**H * (beta A - alpha B) | / ( ulp max( |beta A|, |alpha B| ) ) *> -*> where l**H is the conjugate tranpose of l. +*> where l**H is the conjugate transpose of l. *> *> (2) max over all right eigenvalue/-vector pairs (beta/alpha,r) of *> diff --git a/lapack-netlib/TESTING/EIG/sdrvsg.f b/lapack-netlib/TESTING/EIG/sdrvsg.f index 877579bcd..0c82b6f49 100644 --- a/lapack-netlib/TESTING/EIG/sdrvsg.f +++ b/lapack-netlib/TESTING/EIG/sdrvsg.f @@ -234,7 +234,7 @@ *> *> B REAL array, dimension (LDB , max(NN)) *> Used to hold the symmetric positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -399,7 +399,7 @@ EXTERNAL LSAME, SLAMCH, SLARND * .. * .. External Subroutines .. - EXTERNAL SLABAD, SLACPY, SLAFTS, SLASET, SLASUM, SLATMR, + EXTERNAL SLACPY, SLAFTS, SLASET, SLASUM, SLATMR, $ SLATMS, SSBGV, SSBGVD, SSBGVX, SSGT01, SSPGV, $ SSPGVD, SSPGVX, SSYGV, SSYGVD, SSYGVX, XERBLA * .. @@ -460,7 +460,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/sdrvsg2stg.f b/lapack-netlib/TESTING/EIG/sdrvsg2stg.f index ebd169977..38ed61628 100644 --- a/lapack-netlib/TESTING/EIG/sdrvsg2stg.f +++ b/lapack-netlib/TESTING/EIG/sdrvsg2stg.f @@ -240,7 +240,7 @@ *> *> B REAL array, dimension (LDB , max(NN)) *> Used to hold the symmetric positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -408,7 +408,7 @@ EXTERNAL LSAME, SLAMCH, SLARND * .. * .. External Subroutines .. - EXTERNAL SLABAD, SLACPY, SLAFTS, SLASET, SLASUM, SLATMR, + EXTERNAL SLACPY, SLAFTS, SLASET, SLASUM, SLATMR, $ SLATMS, SSBGV, SSBGVD, SSBGVX, SSGT01, SSPGV, $ SSPGVD, SSPGVX, SSYGV, SSYGVD, SSYGVX, XERBLA, $ SSYGV_2STAGE @@ -470,7 +470,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/sdrvst.f b/lapack-netlib/TESTING/EIG/sdrvst.f index ea0cf66f9..be6d33cee 100644 --- a/lapack-netlib/TESTING/EIG/sdrvst.f +++ b/lapack-netlib/TESTING/EIG/sdrvst.f @@ -214,7 +214,7 @@ *> Not modified. *> *> D1 REAL array, dimension (max(NN)) -*> The eigenvalues of A, as computed by SSTEQR simlutaneously +*> The eigenvalues of A, as computed by SSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -502,11 +502,11 @@ EXTERNAL SLAMCH, SLARND, SSXT1 * .. * .. External Subroutines .. - EXTERNAL ALASVM, SLABAD, SLACPY, SLAFTS, SLASET, SLATMR, - $ SLATMS, SSBEV, SSBEVD, SSBEVX, SSPEV, SSPEVD, - $ SSPEVX, SSTEV, SSTEVD, SSTEVR, SSTEVX, SSTT21, - $ SSTT22, SSYEV, SSYEVD, SSYEVR, SSYEVX, SSYT21, - $ SSYT22, XERBLA + EXTERNAL ALASVM, SLACPY, SLAFTS, SLASET, SLATMR, SLATMS, + $ SSBEV, SSBEVD, SSBEVX, SSPEV, SSPEVD, SSPEVX, + $ SSTEV, SSTEVD, SSTEVR, SSTEVX, SSTT21, SSTT22, + $ SSYEV, SSYEVD, SSYEVR, SSYEVX, SSYT21, SSYT22, + $ XERBLA * .. * .. Scalars in Common .. CHARACTER*32 SRNAMT @@ -574,7 +574,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/sdrvst2stg.f b/lapack-netlib/TESTING/EIG/sdrvst2stg.f index a13a58b48..e05ec0749 100644 --- a/lapack-netlib/TESTING/EIG/sdrvst2stg.f +++ b/lapack-netlib/TESTING/EIG/sdrvst2stg.f @@ -214,7 +214,7 @@ *> Not modified. *> *> D1 REAL array, dimension (max(NN)) -*> The eigenvalues of A, as computed by SSTEQR simlutaneously +*> The eigenvalues of A, as computed by SSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -502,7 +502,7 @@ EXTERNAL SLAMCH, SLARND, SSXT1 * .. * .. External Subroutines .. - EXTERNAL ALASVM, SLABAD, SLACPY, SLAFTS, SLASET, SLATMR, + EXTERNAL ALASVM, SLACPY, SLAFTS, SLASET, SLATMR, $ SLATMS, SSBEV, SSBEVD, SSBEVX, SSPEV, SSPEVD, $ SSPEVX, SSTEV, SSTEVD, SSTEVR, SSTEVX, SSTT21, $ SSTT22, SSYEV, SSYEVD, SSYEVR, SSYEVX, SSYT21, @@ -577,7 +577,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/serrhs.f b/lapack-netlib/TESTING/EIG/serrhs.f index 8f0ff98a1..89b7303cd 100644 --- a/lapack-netlib/TESTING/EIG/serrhs.f +++ b/lapack-netlib/TESTING/EIG/serrhs.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> SERRHS tests the error exits for SGEBAK, SGEBAL, SGEHRD, SORGHR, -*> SORMHR, SHSEQR, SHSEIN, and STREVC. +*> SERRHS tests the error exits for SGEBAK, SGEBAL, SGEHRD, SGEHD2, +*> SORGHR, SORMHR, SHSEQR, SHSEIN, STREVC, and STREVC3. *> \endverbatim * * Arguments: @@ -85,7 +85,7 @@ * .. * .. External Subroutines .. EXTERNAL CHKXER, SGEBAK, SGEBAL, SGEHRD, SHSEIN, SHSEQR, - $ SORGHR, SORMHR, STREVC + $ SORGHR, SORMHR, STREVC, STREVC3, SGEHD2 * .. * .. Intrinsic Functions .. INTRINSIC REAL @@ -193,6 +193,29 @@ CALL CHKXER( 'SGEHRD', INFOT, NOUT, LERR, OK ) NT = NT + 7 * +* SGEHD2 +* + SRNAMT = 'SGEHD2' + INFOT = 1 + CALL SGEHD2( -1, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGEHD2( 0, 0, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGEHD2( 0, 2, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGEHD2( 1, 1, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGEHD2( 0, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SGEHD2( 2, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK ) + NT = NT + 6 +* * SORGHR * SRNAMT = 'SORGHR' @@ -327,7 +350,11 @@ CALL SHSEQR( 'E', 'V', 2, 1, 2, A, 2, WR, WI, C, 1, W, 1, $ INFO ) CALL CHKXER( 'SHSEQR', INFOT, NOUT, LERR, OK ) - NT = NT + 9 + INFOT = 13 + CALL SHSEQR( 'E', 'N', 2, 1, 2, A, 2, WR, WI, C, 1, W, 1, + $ INFO ) + CALL CHKXER( 'SHSEQR', INFOT, NOUT, LERR, OK ) + NT = NT + 10 * * SHSEIN * @@ -398,6 +425,43 @@ $ INFO ) CALL CHKXER( 'STREVC', INFOT, NOUT, LERR, OK ) NT = NT + 7 +* +* STREVC3 +* + SRNAMT = 'STREVC3' + INFOT = 1 + CALL STREVC3( '/', 'A', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STREVC3( 'L', '/', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STREVC3( 'L', 'A', SEL, -1, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STREVC3( 'L', 'A', SEL, 2, A, 1, VL, 2, VR, 1, 4, M, W, + $ LW, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL STREVC3( 'L', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL STREVC3( 'R', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 1, M, W, + $ LW, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL STREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W, + $ 2, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 END IF * * Print a summary line. diff --git a/lapack-netlib/TESTING/EIG/serrst.f b/lapack-netlib/TESTING/EIG/serrst.f index afb6d4faf..408346382 100644 --- a/lapack-netlib/TESTING/EIG/serrst.f +++ b/lapack-netlib/TESTING/EIG/serrst.f @@ -21,7 +21,7 @@ *> *> \verbatim *> -*> SERRST tests the error exits for SSYTRD, SORGTR, SORMTR, SSPTRD, +*> SERRST tests the error exits for SSYTRD, SSYTD2, SORGTR, SORMTR, SSPTRD, *> SOPGTR, SOPMTR, SSTEQR, SSTERF, SSTEBZ, SSTEIN, SPTEQR, SSBTRD, *> SSYEV, SSYEVX, SSYEVD, SSBEV, SSBEVX, SSBEVD, *> SSPEV, SSPEVX, SSPEVD, SSTEV, SSTEVX, SSTEVD, and SSTEDC. @@ -95,7 +95,7 @@ $ SSBEV, SSBEVD, SSBEVX, SSBTRD, SSPEV, SSPEVD, $ SSPEVX, SSPTRD, SSTEBZ, SSTEDC, SSTEIN, SSTEQR, $ SSTERF, SSTEV, SSTEVD, SSTEVR, SSTEVX, SSYEV, - $ SSYEVD, SSYEVR, SSYEVX, SSYTRD, + $ SSYEVD, SSYEVR, SSYEVX, SSYTRD, SSYTD2, $ SSYEVD_2STAGE, SSYEVR_2STAGE, SSYEVX_2STAGE, $ SSYEV_2STAGE, SSBEV_2STAGE, SSBEVD_2STAGE, $ SSBEVX_2STAGE, SSYTRD_2STAGE, SSYTRD_SY2SB, @@ -157,6 +157,20 @@ CALL CHKXER( 'SSYTRD', INFOT, NOUT, LERR, OK ) NT = NT + 4 * +* SSYTD2 +* + SRNAMT = 'SSYTD2' + INFOT = 1 + CALL SSYTD2( '/', 0, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSYTD2( 'U', -1, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYTD2( 'U', 2, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK ) + NT = NT + 3 +* * SSYTRD_2STAGE * SRNAMT = 'SSYTRD_2STAGE' @@ -719,12 +733,12 @@ CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL SSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 26*N-1, IW( 2*N+1 ), 10*N, + $ M, R, Z, 1, IW, Q, 0, IW( 2*N+1 ), 10*N, $ INFO ) CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL SSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 10*N-1, + $ M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 0, $ INFO ) CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK ) NT = NT + 11 diff --git a/lapack-netlib/TESTING/EIG/ssyl01.f b/lapack-netlib/TESTING/EIG/ssyl01.f index 22d089dc8..fda30a3c0 100644 --- a/lapack-netlib/TESTING/EIG/ssyl01.f +++ b/lapack-netlib/TESTING/EIG/ssyl01.f @@ -117,13 +117,15 @@ $ SCALE, SCALE3, SMLNUM, TNRM, XNRM * .. * .. Local Arrays .. - REAL A( MAXM, MAXM ), B( MAXN, MAXN ), - $ C( MAXM, MAXN ), CC( MAXM, MAXN ), - $ X( MAXM, MAXN ), - $ DUML( MAXM ), DUMR( MAXN ), + REAL DUML( MAXM ), DUMR( MAXN ), $ D( MAX( MAXM, MAXN ) ), DUM( MAXN ), - $ SWORK( LDSWORK, 54 ), VM( 2 ) - INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 ) + $ VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, C, CC, X, + $ SWORK * .. * .. External Functions .. LOGICAL SISNAN @@ -136,6 +138,20 @@ * .. Intrinsic Functions .. INTRINSIC ABS, REAL, MAX * .. +* .. Allocate memory dynamically .. + ALLOCATE ( A( MAXM, MAXM ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( B( MAXN, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( C( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( CC( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( X( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( SWORK( LDSWORK, 54 ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. * * Get machine parameters @@ -280,6 +296,13 @@ END DO END DO END DO +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (CC, STAT = AllocateStatus) + DEALLOCATE (X, STAT = AllocateStatus) + DEALLOCATE (SWORK, STAT = AllocateStatus) * RETURN * diff --git a/lapack-netlib/TESTING/EIG/zchkdmd.f90 b/lapack-netlib/TESTING/EIG/zchkdmd.f90 new file mode 100644 index 000000000..873d956c4 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/zchkdmd.f90 @@ -0,0 +1,745 @@ +! This is a test program for checking the implementations of +! the implementations of the following subroutines +! +! ZGEDMD, for computation of the +! Dynamic Mode Decomposition (DMD) +! ZGEDMDQ, for computation of a +! QR factorization based compressed DMD +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! ======================================================== +! How to run the code (compiler, link info) +! ======================================================== +! Compile as FORTRAN 90 (or later) and link with BLAS and +! LAPACK libraries. +! NOTE: The code is developed and tested on top of the +! Intel MKL library (versions 2022.0.3 and 2022.2.0), +! using the Intel Fortran compiler. +! +! For developers of the C++ implementation +! ======================================================== +! See the LAPACK++ and Template Numerical Toolkit (TNT) +! +! Note on a development of the GPU HP implementation +! ======================================================== +! Work in progress. See CUDA, MAGMA, SLATE. +! NOTE: The four SVD subroutines used in this code are +! included as a part of R&D and for the completeness. +! This was also an opportunity to test those SVD codes. +! If the scaling option is used all four are essentially +! equally good. For implementations on HP platforms, +! one can use whichever SVD is available. +!............................................................ + +!............................................................ +!............................................................ +! + PROGRAM DMD_TEST + use iso_fortran_env, only: real64 + IMPLICIT NONE + integer, parameter :: WP = real64 + +!............................................................ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP + + COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) + COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) +!............................................................ + REAL(KIND=WP), ALLOCATABLE, DIMENSION(:) :: RES, & + RES1, RESEX, SINGVX, SINGVQX, WORK + INTEGER , ALLOCATABLE, DIMENSION(:) :: IWORK + REAL(KIND=WP) :: WDUMMY(2) + INTEGER :: IDUMMY(4), ISEED(4) + REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, EPS, & + TOL, TOL2, SVDIFF, TMP, TMP_AU, & + TMP_FQR, TMP_REZ, TMP_REZQ, TMP_ZXW, & + TMP_EX + +!............................................................ + COMPLEX(KIND=WP) :: ZMAX + INTEGER :: LZWORK + COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:,:) :: ZA, ZAC, & + ZAU, ZF, ZF0, ZF1, ZS, ZW, & + ZX, ZX0, ZY, ZY0, ZY1, ZZ, ZZ1 + COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:) :: ZDA, ZDR, & + ZDL, ZEIGS, ZEIGSA, ZWORK + COMPLEX(KIND=WP) :: ZDUMMY(22), ZDUM2X2(2,2) +!............................................................ + INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY, & + LDZ, LIWORK, LWORK, M, N, LLOOP, NRNK, NRNKsp + INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, j, & + NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ, & + NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL, & + NFAIL_Z_XV, MODE, MODEL, MODER, WHTSVD, & + WHTSVDsp + INTEGER :: iNRNK, iWHTSVD, K_TRAJ, LWMINOPT + CHARACTER :: GRADE, JOBREF, JOBZ, PIVTNG, RSIGN, & + SCALE, RESIDS, WANTQ, WANTR + LOGICAL :: TEST_QRDMD + +!.....external subroutines (BLAS and LAPACK) + EXTERNAL DAXPY, DGEEV, DGEMM, DGEMV, DLACPY, DLASCL + EXTERNAL ZGEEV, ZGEMV, ZLASCL + EXTERNAL ZLARNV, ZLATMR + EXTERNAL ZAXPY, ZGEMM +!.....external subroutines DMD package, part 1 +! subroutines under test + EXTERNAL ZGEDMD, ZGEDMDQ +!.....external functions (BLAS and LAPACK) + EXTERNAL DLAMCH, DZNRM2 + REAL(KIND=WP) :: DLAMCH, DZNRM2 + REAL(KIND=WP) :: ZLANGE + EXTERNAL IZAMAX + INTEGER IZAMAX + EXTERNAL LSAME + LOGICAL LSAME + + INTRINSIC ABS, INT, MIN, MAX, SIGN +!............................................................ + + ! The test is always in pairs : ( ZGEDMD and ZGEDMDQ ) + ! because the test includes comparing the results (in pairs). +!..................................................................................... + TEST_QRDMD = .TRUE. ! This code by default performs tests on ZGEDMDQ + ! Since the QR factorizations based algorithm is designed for + ! single trajectory data, only single trajectory tests will + ! be performed with xGEDMDQ. + WANTQ = 'Q' + WANTR = 'R' +!................................................................................. + + EPS = DLAMCH( 'P' ) ! machine precision DP + + ! Global counters of failures of some particular tests + NFAIL = 0 + NFAIL_REZ = 0 + NFAIL_REZQ = 0 + NFAIL_Z_XV = 0 + NFAIL_F_QR = 0 + NFAIL_AU = 0 + NFAIL_SVDIFF = 0 + NFAIL_TOTAL = 0 + NFAILQ_TOTAL = 0 + + DO LLOOP = 1, 4 + + WRITE(*,*) 'L Loop Index = ', LLOOP + + ! Set the dimensions of the problem ... + WRITE(*,*) 'M = ' + READ(*,*) M + WRITE(*,*) M + ! ... and the number of snapshots. + WRITE(*,*) 'N = ' + READ(*,*) N + WRITE(*,*) N + + ! ... Test the dimensions + IF ( ( MIN(M,N) == 0 ) .OR. ( M < N ) ) THEN + WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.' + STOP + END IF +!............. + ! The seed inside the LLOOP so that each pass can be reproduced easily. + ISEED(1) = 4 + ISEED(2) = 3 + ISEED(3) = 2 + ISEED(4) = 1 + + LDA = M + LDF = M + LDX = M + LDY = M + LDW = N + LDZ = M + LDAU = M + LDS = N + + TMP_ZXW = ZERO + TMP_AU = ZERO + TMP_REZ = ZERO + TMP_REZQ = ZERO + SVDIFF = ZERO + TMP_EX = ZERO + + ALLOCATE( ZA(LDA,M) ) + ALLOCATE( ZAC(LDA,M) ) + ALLOCATE( ZF(LDF,N+1) ) + ALLOCATE( ZF0(LDF,N+1) ) + ALLOCATE( ZF1(LDF,N+1) ) + ALLOCATE( ZX(LDX,N) ) + ALLOCATE( ZX0(LDX,N) ) + ALLOCATE( ZY(LDY,N+1) ) + ALLOCATE( ZY0(LDY,N+1) ) + ALLOCATE( ZY1(LDY,N+1) ) + ALLOCATE( ZAU(LDAU,N) ) + ALLOCATE( ZW(LDW,N) ) + ALLOCATE( ZS(LDS,N) ) + ALLOCATE( ZZ(LDZ,N) ) + ALLOCATE( ZZ1(LDZ,N) ) + ALLOCATE( RES(N) ) + ALLOCATE( RES1(N) ) + ALLOCATE( RESEX(N) ) + ALLOCATE( ZEIGS(N) ) + ALLOCATE( SINGVX(N) ) + ALLOCATE( SINGVQX(N) ) + + TOL = M*EPS + ! This mimics O(M*N)*EPS bound for accumulated roundoff error. + ! The factor 10 is somewhat arbitrary. + TOL2 = 10*M*N*EPS + +!............. + + DO K_TRAJ = 1, 2 + ! Number of intial conditions in the simulation/trajectories (1 or 2) + + COND = 1.0D4 + ZMAX = (1.0D1,1.0D1) + RSIGN = 'F' + GRADE = 'N' + MODEL = 6 + CONDL = 1.0D1 + MODER = 6 + CONDR = 1.0D1 + PIVTNG = 'N' + + ! Loop over all parameter MODE values for ZLATMR (+1,..,+6) + DO MODE = 1, 6 + + ALLOCATE( IWORK(2*M) ) + ALLOCATE( ZDA(M) ) + ALLOCATE( ZDL(M) ) + ALLOCATE( ZDR(M) ) + + CALL ZLATMR( M, M, 'N', ISEED, 'N', ZDA, MODE, COND, & + ZMAX, RSIGN, GRADE, ZDL, MODEL, CONDL, & + ZDR, MODER, CONDR, PIVTNG, IWORK, M, M, & + ZERO, -ONE, 'N', ZA, LDA, IWORK(M+1), INFO ) + DEALLOCATE( ZDR ) + DEALLOCATE( ZDL ) + DEALLOCATE( ZDA ) + DEALLOCATE( IWORK ) + + LZWORK = MAX(1,2*M) + ALLOCATE( ZEIGSA(M) ) + ALLOCATE( ZWORK(LZWORK) ) + ALLOCATE( WORK(2*M) ) + ZAC(1:M,1:M) = ZA(1:M,1:M) + CALL ZGEEV( 'N','N', M, ZAC, LDA, ZEIGSA, ZDUM2X2, 2, & + ZDUM2X2, 2, ZWORK, LZWORK, WORK, INFO ) ! LAPACK CALL + DEALLOCATE(WORK) + DEALLOCATE(ZWORK) + + TMP = ABS(ZEIGSA(IZAMAX(M, ZEIGSA, 1))) ! The spectral radius of ZA + ! Scale the matrix ZA to have unit spectral radius. + CALL ZLASCL( 'G',0, 0, TMP, ONE, M, M, & + ZA, LDA, INFO ) + CALL ZLASCL( 'G',0, 0, TMP, ONE, M, 1, & + ZEIGSA, M, INFO ) + ANORM = ZLANGE( 'F', M, M, ZA, LDA, WDUMMY ) + + IF ( K_TRAJ == 2 ) THEN + ! generate data as two trajectories + ! with two inital conditions + CALL ZLARNV(2, ISEED, M, ZF(1,1) ) + DO i = 1, N/2 + CALL ZGEMV( 'N', M, M, ZONE, ZA, LDA, ZF(1,i), 1, & + ZZERO, ZF(1,i+1), 1 ) + END DO + ZX0(1:M,1:N/2) = ZF(1:M,1:N/2) + ZY0(1:M,1:N/2) = ZF(1:M,2:N/2+1) + + CALL ZLARNV(2, ISEED, M, ZF(1,1) ) + DO i = 1, N-N/2 + CALL ZGEMV( 'N', M, M, ZONE, ZA, LDA, ZF(1,i), 1, & + ZZERO, ZF(1,i+1), 1 ) + END DO + ZX0(1:M,N/2+1:N) = ZF(1:M,1:N-N/2) + ZY0(1:M,N/2+1:N) = ZF(1:M,2:N-N/2+1) + ELSE + CALL ZLARNV(2, ISEED, M, ZF(1,1) ) + DO i = 1, N + CALL ZGEMV( 'N', M, M, ZONE, ZA, M, ZF(1,i), 1, & + ZZERO, ZF(1,i+1), 1 ) + END DO + ZF0(1:M,1:N+1) = ZF(1:M,1:N+1) + ZX0(1:M,1:N) = ZF0(1:M,1:N) + ZY0(1:M,1:N) = ZF0(1:M,2:N+1) + END IF + + DEALLOCATE( ZEIGSA ) +!........................................................................ + + DO iJOBZ = 1, 4 + + SELECT CASE ( iJOBZ ) + CASE(1) + JOBZ = 'V' + RESIDS = 'R' + CASE(2) + JOBZ = 'V' + RESIDS = 'N' + CASE(3) + JOBZ = 'F' + RESIDS = 'N' + CASE(4) + JOBZ = 'N' + RESIDS = 'N' + END SELECT + + DO iJOBREF = 1, 3 + + SELECT CASE ( iJOBREF ) + CASE(1) + JOBREF = 'R' + CASE(2) + JOBREF = 'E' + CASE(3) + JOBREF = 'N' + END SELECT + + DO iSCALE = 1, 4 + + SELECT CASE ( iSCALE ) + CASE(1) + SCALE = 'S' + CASE(2) + SCALE = 'C' + CASE(3) + SCALE = 'Y' + CASE(4) + SCALE = 'N' + END SELECT + + DO iNRNK = -1, -2, -1 + NRNK = iNRNK + NRNKsp = iNRNK + + DO iWHTSVD = 1, 3 + ! Check all four options to compute the POD basis + ! via the SVD. + WHTSVD = iWHTSVD + WHTSVDsp = iWHTSVD + + DO LWMINOPT = 1, 2 + ! Workspace query for the minimal (1) and for the optimal + ! (2) workspace lengths determined by workspace query. + + ! ZGEDMD is always tested and its results are also used for + ! comparisons with ZGEDMDQ. + + ZX(1:M,1:N) = ZX0(1:M,1:N) + ZY(1:M,1:N) = ZY0(1:M,1:N) + + CALL ZGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, ZX, LDX, ZY, LDY, NRNK, TOL, & + K, ZEIGS, ZZ, LDZ, RES, ZAU, LDAU, & + ZW, LDW, ZS, LDS, ZDUMMY, -1, & + WDUMMY, -1, IDUMMY, -1, INFO ) + IF ( (INFO .EQ. 2) .OR. ( INFO .EQ. 3 ) & + .OR. ( INFO < 0 ) ) THEN + WRITE(*,*) 'Call to ZGEDMD workspace query failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ', & + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL, LDZ, LDAU, LDW, LDS + STOP + END IF + + LZWORK = INT(ZDUMMY(LWMINOPT)) + LWORK = INT(WDUMMY(1)) + LIWORK = IDUMMY(1) + + ALLOCATE(ZWORK(LZWORK)) + ALLOCATE( WORK(LWORK)) + ALLOCATE(IWORK(LIWORK)) + + CALL ZGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, ZX, LDX, ZY, LDY, NRNK, TOL, & + K, ZEIGS, ZZ, LDZ, RES, ZAU, LDAU, & + ZW, LDW, ZS, LDS, ZWORK, LZWORK, & + WORK, LWORK, IWORK, LIWORK, INFO ) + + IF ( INFO /= 0 ) THEN + WRITE(*,*) 'Call to ZGEDMD failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + STOP + END IF + + SINGVX(1:N) = WORK(1:N) + + !...... ZGEDMD check point + IF ( LSAME(JOBZ,'V') ) THEN + ! Check that Z = X*W, on return from ZGEDMD + ! This checks that the returned eigenvectors in Z are + ! the product of the SVD'POD basis returned in X + ! and the eigenvectors of the rayleigh quotient + ! returned in W + CALL ZGEMM( 'N', 'N', M, K, K, ZONE, ZX, LDX, ZW, LDW, & + ZZERO, ZZ1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL ZAXPY( M, -ZONE, ZZ(1,i), 1, ZZ1(1,i), 1) + TMP = MAX(TMP, DZNRM2( M, ZZ1(1,i), 1 ) ) + END DO + TMP_ZXW = MAX(TMP_ZXW, TMP ) + IF ( TMP_ZXW <= 10*M*EPS ) THEN + !WRITE(*,*) ' :) .... OK .........ZGEDMD PASSED.' + ELSE + NFAIL_Z_XV = NFAIL_Z_XV + 1 + WRITE(*,*) ':( .................ZGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + END IF + + + !...... ZGEDMD check point + IF ( LSAME(JOBREF,'R') ) THEN + ! The matrix A*U is returned for computing refined Ritz vectors. + ! Check that A*U is computed correctly using the formula + ! A*U = Y * V * inv(SIGMA). This depends on the + ! accuracy in the computed singular values and vectors of X. + ! See the paper for an error analysis. + ! Note that the left singular vectors of the input matrix X + ! are returned in the array X. + CALL ZGEMM( 'N', 'N', M, K, M, ZONE, ZA, LDA, ZX, LDX, & + ZZERO, ZZ1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL ZAXPY( M, -ZONE, ZAU(1,i), 1, ZZ1(1,i), 1) + TMP = MAX( TMP, DZNRM2( M, ZZ1(1,i),1 ) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_AU = MAX( TMP_AU, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) ':) .... OK .........ZGEDMD PASSED.' + ELSE + NFAIL_AU = NFAIL_AU + 1 + WRITE(*,*) ':( .................ZGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + ELSEIF ( LSAME(JOBREF,'E') ) THEN + ! The unscaled vectors of the Exact DMD are computed. + ! This option is included for the sake of completeness, + ! for users who prefer the Exact DMD vectors. The + ! returned vectors are in the real form, in the same way + ! as the Ritz vectors. Here we just save the vectors + ! and test them separately using a Matlab script. + + + CALL ZGEMM( 'N', 'N', M, K, M, ZONE, ZA, LDA, ZAU, LDAU, ZZERO, ZY1, LDY ) + + DO i=1, K + ! have a real eigenvalue with real eigenvector + CALL ZAXPY( M, -ZEIGS(i), ZAU(1,i), 1, ZY1(1,i), 1 ) + RESEX(i) = DZNRM2( M, ZY1(1,i), 1) / DZNRM2(M,ZAU(1,i),1) + END DO + END IF + !...... ZGEDMD check point + + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by ZGEDMD with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL ZGEMM( 'N', 'N', M, K, M, ZONE, ZA, LDA, ZZ, LDZ, ZZERO, ZY1, LDY ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in ZGEDMD,) + + DO i=1, K + ! have a real eigenvalue with real eigenvector + CALL ZAXPY( M, -ZEIGS(i), ZZ(1,i), 1, ZY1(1,i), 1 ) + RES1(i) = DZNRM2( M, ZY1(1,i), 1) + END DO + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_REZ = MAX( TMP_REZ, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) ':) .... OK ..........ZGEDMD PASSED.' + ELSE + NFAIL_REZ = NFAIL_REZ + 1 + WRITE(*,*) ':( ..................ZGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + + + IF ( LSAME(JOBREF,'E') ) THEN + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) ) + END DO + TMP_EX = MAX(TMP_EX,TMP) + END IF + + END IF + + DEALLOCATE(ZWORK) + DEALLOCATE(WORK) + DEALLOCATE(IWORK) + + IF ( TEST_QRDMD .AND. (K_TRAJ == 1) ) THEN + + ZF(1:M,1:N+1) = ZF0(1:M,1:N+1) + + CALL ZGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, & + WHTSVD, M, N+1, ZF, LDF, ZX, LDX, ZY, LDY, & + NRNK, TOL, K, ZEIGS, ZZ, LDZ, RES, ZAU, & + LDAU, ZW, LDW, ZS, LDS, ZDUMMY, -1, & + WDUMMY, -1, IDUMMY, -1, INFO ) + + LZWORK = INT(ZDUMMY(LWMINOPT)) + ALLOCATE( ZWORK(LZWORK) ) + LIWORK = IDUMMY(1) + ALLOCATE(IWORK(LIWORK)) + LWORK = INT(WDUMMY(1)) + ALLOCATE(WORK(LWORK)) + + CALL ZGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, & + WHTSVD, M, N+1, ZF, LDF, ZX, LDX, ZY, LDY, & + NRNK, TOL, KQ, ZEIGS, ZZ, LDZ, RES, ZAU, & + LDAU, ZW, LDW, ZS, LDS, ZWORK, LZWORK, & + WORK, LWORK, IWORK, LIWORK, INFO ) + + IF ( INFO /= 0 ) THEN + WRITE(*,*) 'Call to ZGEDMDQ failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, WANTQ, WANTR, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + STOP + END IF + SINGVQX(1:N) = WORK(1:N) + + !..... ZGEDMDQ check point + + IF ( 1 == 0 ) THEN + ! Comparison of ZGEDMD and ZGEDMDQ singular values disabled + TMP = ZERO + DO i = 1, MIN(K, KQ) + TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / & + SINGVX(1) ) + END DO + SVDIFF = MAX( SVDIFF, TMP ) + IF ( TMP > M*N*EPS ) THEN + WRITE(*,*) 'FAILED! Something was wrong with the run.' + NFAIL_SVDIFF = NFAIL_SVDIFF + 1 + DO j =1, 3 + write(*,*) j, SINGVX(j), SINGVQX(j) + read(*,*) + END DO + + END IF + END IF + + !..... ZGEDMDQ check point + IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN + ! Check that the QR factors are computed and returned + ! as requested. The residual ||F-Q*R||_F / ||F||_F + ! is compared to M*N*EPS. + ZF1(1:M,1:N+1) = ZF0(1:M,1:N+1) + CALL ZGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -ZONE, ZF, & + LDF, ZY, LDY, ZONE, ZF1, LDF ) + TMP_FQR = ZLANGE( 'F', M, N+1, ZF1, LDF, WORK ) / & + ZLANGE( 'F', M, N+1, ZF0, LDF, WORK ) + IF ( TMP_FQR > TOL2 ) THEN + WRITE(*,*) 'FAILED! Something was wrong with the run.' + NFAIL_F_QR = NFAIL_F_QR + 1 + ELSE + !WRITE(*,*) '........ PASSED.' + END IF + END IF + + !..... ZGEDMDQ check point + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by ZGEDMDQ with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL ZGEMM( 'N', 'N', M, KQ, M, ZONE, ZA, LDA, ZZ, LDZ, ZZERO, ZY1, LDY ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in ZGEDMDQ) + + DO i=1, KQ + ! have a real eigenvalue with real eigenvector + CALL ZAXPY( M, -ZEIGS(i), ZZ(1,i), 1, ZY1(1,i), 1 ) + ! Y(1:M,i) = Y(1:M,i) - REIG(i)*Z(1:M,i) + RES1(i) = DZNRM2( M, ZY1(1,i), 1) + END DO + TMP = ZERO + DO i = 1, KQ + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVQX(KQ)/(ANORM*SINGVQX(1)) ) + END DO + TMP_REZQ = MAX( TMP_REZQ, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) '.... OK ........ ZGEDMDQ PASSED.' + ELSE + NFAIL_REZQ = NFAIL_REZQ + 1 + WRITE(*,*) '................ ZGEDMDQ FAILED!', & + 'Check the code for implementation errors.' + STOP + END IF + + END IF + + DEALLOCATE( ZWORK ) + DEALLOCATE( WORK ) + DEALLOCATE( IWORK ) + + END IF ! ZGEDMDQ + +!....................................................................................................... + + END DO ! LWMINOPT + !write(*,*) 'LWMINOPT loop completed' + END DO ! iWHTSVD + !write(*,*) 'WHTSVD loop completed' + END DO ! iNRNK -2:-1 + !write(*,*) 'NRNK loop completed' + END DO ! iSCALE 1:4 + !write(*,*) 'SCALE loop completed' + END DO + !write(*,*) 'JOBREF loop completed' + END DO ! iJOBZ + !write(*,*) 'JOBZ loop completed' + + END DO ! MODE -6:6 + !write(*,*) 'MODE loop completed' + END DO ! 1 or 2 trajectories + !write(*,*) 'trajectories loop completed' + + DEALLOCATE( ZA ) + DEALLOCATE( ZAC ) + DEALLOCATE( ZZ ) + DEALLOCATE( ZF ) + DEALLOCATE( ZF0 ) + DEALLOCATE( ZF1 ) + DEALLOCATE( ZX ) + DEALLOCATE( ZX0 ) + DEALLOCATE( ZY ) + DEALLOCATE( ZY0 ) + DEALLOCATE( ZY1 ) + DEALLOCATE( ZAU ) + DEALLOCATE( ZW ) + DEALLOCATE( ZS ) + DEALLOCATE( ZZ1 ) + DEALLOCATE( RES ) + DEALLOCATE( RES1 ) + DEALLOCATE( RESEX ) + DEALLOCATE( ZEIGS ) + DEALLOCATE( SINGVX ) + DEALLOCATE( SINGVQX ) + + END DO ! LLOOP + + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for ZGEDMD :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + IF ( NFAIL_Z_XV == 0 ) THEN + WRITE(*,*) '>>>> Z - U*V test PASSED.' + ELSE + WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)' + WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_ZXW + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_Z_XV + END IF + IF ( NFAIL_AU == 0 ) THEN + WRITE(*,*) '>>>> A*U test PASSED. ' + ELSE + WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)' + WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU + END IF + + IF ( NFAIL_REZ == 0 ) THEN + WRITE(*,*) '>>>> Rezidual computation test PASSED.' + ELSE + WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZ, 'time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ + END IF + + IF ( NFAIL_TOTAL == 0 ) THEN + WRITE(*,*) '>>>> ZGEDMD :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAIL_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>>>>>>>>> ZGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + IF ( TEST_QRDMD ) THEN + WRITE(*,*) + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for ZGEDMDQ :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + + IF ( NFAIL_SVDIFF == 0 ) THEN + WRITE(*,*) '>>>> ZGEDMD and ZGEDMDQ computed singular & + &values test PASSED.' + ELSE + WRITE(*,*) 'ZGEDMD and ZGEDMDQ discrepancies in & + &the singular values unacceptable ', & + NFAIL_SVDIFF, ' times. Test FAILED.' + WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF + END IF + + IF ( NFAIL_F_QR == 0 ) THEN + WRITE(*,*) '>>>> F - Q*R test PASSED.' + ELSE + WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)' + WRITE(*,*) 'The largest relative residual was ', TMP_FQR + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR + END IF + + IF ( NFAIL_REZQ == 0 ) THEN + WRITE(*,*) '>>>> Rezidual computation test PASSED.' + ELSE + WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZQ, 'time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ + END IF + + IF ( NFAILQ_TOTAL == 0 ) THEN + WRITE(*,*) '>>>>>>> ZGEDMDQ :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>> ZGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + END IF + + WRITE(*,*) + WRITE(*,*) 'Test completed.' + STOP + END diff --git a/lapack-netlib/TESTING/EIG/zchkst.f b/lapack-netlib/TESTING/EIG/zchkst.f index 60496dde1..b77d94438 100644 --- a/lapack-netlib/TESTING/EIG/zchkst.f +++ b/lapack-netlib/TESTING/EIG/zchkst.f @@ -364,7 +364,7 @@ *> \verbatim *> D1 is DOUBLE PRECISION array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by ZSTEQR simlutaneously +*> The eigenvalues of A, as computed by ZSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -662,11 +662,10 @@ EXTERNAL ILAENV, DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL DCOPY, DLABAD, DLASUM, DSTEBZ, DSTECH, DSTERF, - $ XERBLA, ZCOPY, ZHET21, ZHETRD, ZHPT21, ZHPTRD, - $ ZLACPY, ZLASET, ZLATMR, ZLATMS, ZPTEQR, ZSTEDC, - $ ZSTEMR, ZSTEIN, ZSTEQR, ZSTT21, ZSTT22, ZUNGTR, - $ ZUPGTR + EXTERNAL DCOPY, DLASUM, DSTEBZ, DSTECH, DSTERF, XERBLA, + $ ZCOPY, ZHET21, ZHETRD, ZHPT21, ZHPTRD, ZLACPY, + $ ZLASET, ZLATMR, ZLATMS, ZPTEQR, ZSTEDC, ZSTEMR, + $ ZSTEIN, ZSTEQR, ZSTT21, ZSTT22, ZUNGTR, ZUPGTR * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, INT, LOG, MAX, MIN, SQRT @@ -733,7 +732,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/zchkst2stg.f b/lapack-netlib/TESTING/EIG/zchkst2stg.f index b1ef80816..3b333ef1c 100644 --- a/lapack-netlib/TESTING/EIG/zchkst2stg.f +++ b/lapack-netlib/TESTING/EIG/zchkst2stg.f @@ -385,7 +385,7 @@ *> \verbatim *> D1 is DOUBLE PRECISION array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by ZSTEQR simlutaneously +*> The eigenvalues of A, as computed by ZSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -683,11 +683,11 @@ EXTERNAL ILAENV, DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL DCOPY, DLABAD, DLASUM, DSTEBZ, DSTECH, DSTERF, - $ XERBLA, ZCOPY, ZHET21, ZHETRD, ZHPT21, ZHPTRD, - $ ZLACPY, ZLASET, ZLATMR, ZLATMS, ZPTEQR, ZSTEDC, - $ ZSTEMR, ZSTEIN, ZSTEQR, ZSTT21, ZSTT22, ZUNGTR, - $ ZUPGTR, ZHETRD_2STAGE, DLASET + EXTERNAL DCOPY, DLASUM, DSTEBZ, DSTECH, DSTERF, XERBLA, + $ ZCOPY, ZHET21, ZHETRD, ZHPT21, ZHPTRD, ZLACPY, + $ ZLASET, ZLATMR, ZLATMS, ZPTEQR, ZSTEDC, ZSTEMR, + $ ZSTEIN, ZSTEQR, ZSTT21, ZSTT22, ZUNGTR, ZUPGTR, + $ ZHETRD_2STAGE, DLASET * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, INT, LOG, MAX, MIN, SQRT @@ -754,7 +754,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/zdrgsx.f b/lapack-netlib/TESTING/EIG/zdrgsx.f index 096ed3c22..a486873fa 100644 --- a/lapack-netlib/TESTING/EIG/zdrgsx.f +++ b/lapack-netlib/TESTING/EIG/zdrgsx.f @@ -394,7 +394,7 @@ EXTERNAL ZLCTSX, ILAENV, DLAMCH, ZLANGE * .. * .. External Subroutines .. - EXTERNAL ALASVM, DLABAD, XERBLA, ZGESVD, ZGET51, ZGGESX, + EXTERNAL ALASVM, XERBLA, ZGESVD, ZGET51, ZGGESX, $ ZLACPY, ZLAKF2, ZLASET, ZLATM5 * .. * .. Scalars in Common .. @@ -479,7 +479,6 @@ ULPINV = ONE / ULP SMLNUM = DLAMCH( 'S' ) / ULP BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) THRSH2 = TEN*THRESH NTESTT = 0 NERRS = 0 @@ -918,7 +917,7 @@ $ / ' 2: A and B are upper triangular matrices, ', $ / ' 3: A and B are as type 2, but each second diagonal ', $ 'block in A_11 and ', / - $ ' each third diaongal block in A_22 are 2x2 blocks,', + $ ' each third diagonal block in A_22 are 2x2 blocks,', $ / ' 4: A and B are block diagonal matrices, ', $ / ' 5: (A,B) has potentially close or common ', $ 'eigenvalues.', / ) diff --git a/lapack-netlib/TESTING/EIG/zdrgvx.f b/lapack-netlib/TESTING/EIG/zdrgvx.f index 72e0303fb..813151f13 100644 --- a/lapack-netlib/TESTING/EIG/zdrgvx.f +++ b/lapack-netlib/TESTING/EIG/zdrgvx.f @@ -55,7 +55,7 @@ *> *> | l**H * (beta A - alpha B) | / ( ulp max( |beta A|, |alpha B| ) ) *> -*> where l**H is the conjugate tranpose of l. +*> where l**H is the conjugate transpose of l. *> *> (2) max over all right eigenvalue/-vector pairs (beta/alpha,r) of *> diff --git a/lapack-netlib/TESTING/EIG/zdrvsg.f b/lapack-netlib/TESTING/EIG/zdrvsg.f index 71f1d6371..1796805ea 100644 --- a/lapack-netlib/TESTING/EIG/zdrvsg.f +++ b/lapack-netlib/TESTING/EIG/zdrvsg.f @@ -236,7 +236,7 @@ *> *> B COMPLEX*16 array, dimension (LDB , max(NN)) *> Used to hold the Hermitian positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -418,7 +418,7 @@ EXTERNAL LSAME, DLAMCH, DLARND * .. * .. External Subroutines .. - EXTERNAL DLABAD, DLAFTS, DLASUM, XERBLA, ZHBGV, ZHBGVD, + EXTERNAL DLAFTS, DLASUM, XERBLA, ZHBGV, ZHBGVD, $ ZHBGVX, ZHEGV, ZHEGVD, ZHEGVX, ZHPGV, ZHPGVD, $ ZHPGVX, ZLACPY, ZLASET, ZLATMR, ZLATMS, ZSGT01 * .. @@ -481,7 +481,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/zdrvsg2stg.f b/lapack-netlib/TESTING/EIG/zdrvsg2stg.f index 4bdf2849e..c5ef4ce70 100644 --- a/lapack-netlib/TESTING/EIG/zdrvsg2stg.f +++ b/lapack-netlib/TESTING/EIG/zdrvsg2stg.f @@ -242,7 +242,7 @@ *> *> B COMPLEX*16 array, dimension (LDB , max(NN)) *> Used to hold the Hermitian positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -426,7 +426,7 @@ EXTERNAL LSAME, DLAMCH, DLARND * .. * .. External Subroutines .. - EXTERNAL DLABAD, DLAFTS, DLASUM, XERBLA, ZHBGV, ZHBGVD, + EXTERNAL DLAFTS, DLASUM, XERBLA, ZHBGV, ZHBGVD, $ ZHBGVX, ZHEGV, ZHEGVD, ZHEGVX, ZHPGV, ZHPGVD, $ ZHPGVX, ZLACPY, ZLASET, ZLATMR, ZLATMS, ZSGT01, $ ZHEGV_2STAGE @@ -490,7 +490,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/zdrvst.f b/lapack-netlib/TESTING/EIG/zdrvst.f index 384e58de1..f838f0af2 100644 --- a/lapack-netlib/TESTING/EIG/zdrvst.f +++ b/lapack-netlib/TESTING/EIG/zdrvst.f @@ -204,7 +204,7 @@ *> Not modified. *> *> D1 DOUBLE PRECISION array, dimension (max(NN)) -*> The eigenvalues of A, as computed by ZSTEQR simlutaneously +*> The eigenvalues of A, as computed by ZSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -391,10 +391,10 @@ EXTERNAL DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL ALASVM, DLABAD, DLAFTS, XERBLA, ZHBEV, ZHBEVD, - $ ZHBEVX, ZHEEV, ZHEEVD, ZHEEVR, ZHEEVX, ZHET21, - $ ZHET22, ZHPEV, ZHPEVD, ZHPEVX, ZLACPY, ZLASET, - $ ZLATMR, ZLATMS + EXTERNAL ALASVM, DLAFTS, XERBLA, ZHBEV, ZHBEVD, ZHBEVX, + $ ZHEEV, ZHEEVD, ZHEEVR, ZHEEVX, ZHET21, ZHET22, + $ ZHPEV, ZHPEVD, ZHPEVX, ZLACPY, ZLASET, ZLATMR, + $ ZLATMS * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, INT, LOG, MAX, MIN, SQRT @@ -451,7 +451,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/zdrvst2stg.f b/lapack-netlib/TESTING/EIG/zdrvst2stg.f index 4a88e5218..4b989b460 100644 --- a/lapack-netlib/TESTING/EIG/zdrvst2stg.f +++ b/lapack-netlib/TESTING/EIG/zdrvst2stg.f @@ -204,7 +204,7 @@ *> Not modified. *> *> D1 DOUBLE PRECISION array, dimension (max(NN)) -*> The eigenvalues of A, as computed by ZSTEQR simlutaneously +*> The eigenvalues of A, as computed by ZSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -391,7 +391,7 @@ EXTERNAL DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL ALASVM, DLABAD, DLAFTS, XERBLA, ZHBEV, ZHBEVD, + EXTERNAL ALASVM, DLAFTS, XERBLA, ZHBEV, ZHBEVD, $ ZHBEVX, ZHEEV, ZHEEVD, ZHEEVR, ZHEEVX, ZHET21, $ ZHET22, ZHPEV, ZHPEVD, ZHPEVX, ZLACPY, ZLASET, $ ZHEEVD_2STAGE, ZHEEVR_2STAGE, ZHEEVX_2STAGE, @@ -453,7 +453,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/zerrhs.f b/lapack-netlib/TESTING/EIG/zerrhs.f index 582338947..3a70a556e 100644 --- a/lapack-netlib/TESTING/EIG/zerrhs.f +++ b/lapack-netlib/TESTING/EIG/zerrhs.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> ZERRHS tests the error exits for ZGEBAK, CGEBAL, CGEHRD, ZUNGHR, -*> ZUNMHR, ZHSEQR, CHSEIN, and ZTREVC. +*> ZERRHS tests the error exits for ZGEBAK, ZGEBAL, ZGEHRD, ZGEHD2, +*> ZUNGHR, ZUNMHR, ZHSEQR, ZHSEIN, ZTREVC, and ZTREVC3. *> \endverbatim * * Arguments: @@ -86,7 +86,7 @@ * .. * .. External Subroutines .. EXTERNAL CHKXER, ZGEBAK, ZGEBAL, ZGEHRD, ZHSEIN, ZHSEQR, - $ ZTREVC, ZUNGHR, ZUNMHR + $ ZUNGHR, ZUNMHR, ZTREVC, ZTREVC3 * .. * .. Intrinsic Functions .. INTRINSIC DBLE @@ -193,6 +193,29 @@ CALL CHKXER( 'ZGEHRD', INFOT, NOUT, LERR, OK ) NT = NT + 7 * +* ZGEHD2 +* + SRNAMT = 'ZGEHD2' + INFOT = 1 + CALL ZGEHD2( -1, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGEHD2( 0, 0, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGEHD2( 0, 2, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEHD2( 1, 1, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEHD2( 0, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEHD2( 2, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK ) + NT = NT + 6 +* * ZUNGHR * SRNAMT = 'ZUNGHR' @@ -389,6 +412,47 @@ $ INFO ) CALL CHKXER( 'ZTREVC', INFOT, NOUT, LERR, OK ) NT = NT + 7 +* +* ZTREVC3 +* + SRNAMT = 'ZTREVC3' + INFOT = 1 + CALL ZTREVC3( '/', 'A', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTREVC3( 'L', '/', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTREVC3( 'L', 'A', SEL, -1, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTREVC3( 'L', 'A', SEL, 2, A, 1, VL, 2, VR, 1, 4, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZTREVC3( 'R', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 1, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL ZTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W, + $ 2, RW, 2, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 16 + CALL ZTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + NT = NT + 9 END IF * * Print a summary line. diff --git a/lapack-netlib/TESTING/EIG/zerrst.f b/lapack-netlib/TESTING/EIG/zerrst.f index 5b0e6f820..31881c4de 100644 --- a/lapack-netlib/TESTING/EIG/zerrst.f +++ b/lapack-netlib/TESTING/EIG/zerrst.f @@ -23,7 +23,7 @@ *> *> \verbatim *> -*> ZERRST tests the error exits for ZHETRD, ZUNGTR, CUNMTR, ZHPTRD, +*> ZERRST tests the error exits for ZHETRD, ZHETD2, ZUNGTR, CUNMTR, ZHPTRD, *> ZUNGTR, ZUPMTR, ZSTEQR, CSTEIN, ZPTEQR, ZHBTRD, *> ZHEEV, CHEEVX, CHEEVD, ZHBEV, CHBEVX, CHBEVD, *> ZHPEV, CHPEVX, CHPEVD, and ZSTEDC. @@ -95,7 +95,7 @@ EXTERNAL CHKXER, ZHBEV, ZHBEVD, ZHBEVX, ZHBTRD, ZHEEV, $ ZHEEVD, ZHEEVR, ZHEEVX, ZHETRD, ZHPEV, ZHPEVD, $ ZHPEVX, ZHPTRD, ZPTEQR, ZSTEDC, ZSTEIN, ZSTEQR, - $ ZUNGTR, ZUNMTR, ZUPGTR, ZUPMTR, + $ ZUNGTR, ZUNMTR, ZUPGTR, ZUPMTR, ZHETD2, $ ZHEEVD_2STAGE, ZHEEVR_2STAGE, ZHEEVX_2STAGE, $ ZHEEV_2STAGE, ZHBEV_2STAGE, ZHBEVD_2STAGE, $ ZHBEVX_2STAGE, ZHETRD_2STAGE @@ -156,6 +156,20 @@ CALL CHKXER( 'ZHETRD', INFOT, NOUT, LERR, OK ) NT = NT + 4 * +* ZHETD2 +* + SRNAMT = 'ZHETD2' + INFOT = 1 + CALL ZHETD2( '/', 0, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHETD2( 'U', -1, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHETD2( 'U', 2, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK ) + NT = NT + 3 +* * ZHETRD_2STAGE * SRNAMT = 'ZHETRD_2STAGE' @@ -734,17 +748,17 @@ CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 2*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) NT = NT + 12 @@ -816,19 +830,19 @@ INFOT = 18 CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 26*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) NT = NT + 13 diff --git a/lapack-netlib/TESTING/EIG/zsyl01.f b/lapack-netlib/TESTING/EIG/zsyl01.f index 329f39dc4..5d26d494c 100644 --- a/lapack-netlib/TESTING/EIG/zsyl01.f +++ b/lapack-netlib/TESTING/EIG/zsyl01.f @@ -120,14 +120,16 @@ COMPLEX*16 RMUL * .. * .. Local Arrays .. - COMPLEX*16 A( MAXM, MAXM ), B( MAXN, MAXN ), - $ C( MAXM, MAXN ), CC( MAXM, MAXN ), - $ X( MAXM, MAXN ), - $ DUML( MAXM ), DUMR( MAXN ), + COMPLEX*16 DUML( MAXM ), DUMR( MAXN ), $ D( MAX( MAXM, MAXN ) ) - DOUBLE PRECISION SWORK( LDSWORK, 103 ), DUM( MAXN ), VM( 2 ) + DOUBLE PRECISION DUM( MAXN ), VM( 2 ) INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) * .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + COMPLEX*16, DIMENSION(:,:), ALLOCATABLE :: A, B, C, CC, X + DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: SWORK +* .. * .. External Functions .. LOGICAL DISNAN DOUBLE PRECISION DLAMCH, ZLANGE @@ -139,6 +141,20 @@ * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, SQRT * .. +* .. Allocate memory dynamically .. + ALLOCATE ( A( MAXM, MAXM ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( B( MAXN, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( C( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( CC( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( X( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( SWORK( LDSWORK, 103 ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. * * Get machine parameters @@ -286,6 +302,13 @@ END DO END DO END DO +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (CC, STAT = AllocateStatus) + DEALLOCATE (X, STAT = AllocateStatus) + DEALLOCATE (SWORK, STAT = AllocateStatus) * RETURN * diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index fc55b8a96..e406570e1 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -9,7 +9,7 @@ set(DZLNTST dlaord.f) set(SLINTST schkaa.F schkeq.f schkgb.f schkge.f schkgt.f schklq.f schkpb.f schkpo.f schkps.f schkpp.f - schkpt.f schkq3.f schkql.f schkqr.f schkrq.f + schkpt.f schkq3.f schkqp3rk.f schkql.f schkqr.f schkrq.f schksp.f schksy.f schksy_rook.f schksy_rk.f schksy_aa.f schksy_aa_2stage.f schktb.f schktp.f schktr.f @@ -20,7 +20,7 @@ set(SLINTST schkaa.F serrgt.f serrlq.f serrls.f serrps.f serrql.f serrqp.f serrqr.f serrrq.f serrtr.f serrtz.f - sgbt01.f sgbt02.f sgbt05.f sgelqs.f sgeqls.f sgeqrs.f + sgbt01.f sgbt02.f sgbt05.f sgeqls.f sgerqs.f sget01.f sget02.f sget03.f sget04.f sget06.f sget07.f sgtt01.f sgtt02.f sgtt05.f slaptm.f slarhs.f slatb4.f slatb5.f slattb.f slattp.f @@ -56,7 +56,7 @@ set(CLINTST cchkaa.F cchkhe.f cchkhe_rook.f cchkhe_rk.f cchkhe_aa.f cchkhe_aa_2stage.f cchkhp.f cchklq.f cchkpb.f - cchkpo.f cchkps.f cchkpp.f cchkpt.f cchkq3.f cchkql.f + cchkpo.f cchkps.f cchkpp.f cchkpt.f cchkq3.f cchkqp3rk.f cchkql.f cchkqr.f cchkrq.f cchksp.f cchksy.f cchksy_rook.f cchksy_rk.f cchksy_aa.f cchksy_aa_2stage.f cchktb.f @@ -70,7 +70,7 @@ set(CLINTST cchkaa.F cerrgt.f cerrlq.f cerrls.f cerrps.f cerrql.f cerrqp.f cerrqr.f cerrrq.f cerrtr.f cerrtz.f - cgbt01.f cgbt02.f cgbt05.f cgelqs.f cgeqls.f cgeqrs.f + cgbt01.f cgbt02.f cgbt05.f cgeqls.f cgerqs.f cget01.f cget02.f cget03.f cget04.f cget07.f cgtt01.f cgtt02.f cgtt05.f chet01.f chet01_rook.f chet01_3.f @@ -110,7 +110,7 @@ endif() set(DLINTST dchkaa.F dchkeq.f dchkgb.f dchkge.f dchkgt.f dchklq.f dchkpb.f dchkpo.f dchkps.f dchkpp.f - dchkpt.f dchkq3.f dchkql.f dchkqr.f dchkrq.f + dchkpt.f dchkq3.f dchkqp3rk.f dchkql.f dchkqr.f dchkrq.f dchksp.f dchksy.f dchksy_rook.f dchksy_rk.f dchksy_aa.f dchksy_aa_2stage.f dchktb.f dchktp.f dchktr.f @@ -121,7 +121,7 @@ set(DLINTST dchkaa.F derrgt.f derrlq.f derrls.f derrps.f derrql.f derrqp.f derrqr.f derrrq.f derrtr.f derrtz.f - dgbt01.f dgbt02.f dgbt05.f dgelqs.f dgeqls.f dgeqrs.f + dgbt01.f dgbt02.f dgbt05.f dgeqls.f dgerqs.f dget01.f dget02.f dget03.f dget04.f dget06.f dget07.f dgtt01.f dgtt02.f dgtt05.f dlaptm.f dlarhs.f dlatb4.f dlatb5.f dlattb.f dlattp.f @@ -158,7 +158,7 @@ set(ZLINTST zchkaa.F zchkhe.f zchkhe_rook.f zchkhe_rk.f zchkhe_aa.f zchkhe_aa_2stage.f zchkhp.f zchklq.f zchkpb.f - zchkpo.f zchkps.f zchkpp.f zchkpt.f zchkq3.f zchkql.f + zchkpo.f zchkps.f zchkpp.f zchkpt.f zchkq3.f zchkqp3rk.f zchkql.f zchkqr.f zchkrq.f zchksp.f zchksy.f zchksy_rook.f zchksy_rk.f zchksy_aa.f zchksy_aa_2stage.f zchktb.f @@ -172,7 +172,7 @@ set(ZLINTST zchkaa.F zerrgt.f zerrlq.f zerrls.f zerrps.f zerrql.f zerrqp.f zerrqr.f zerrrq.f zerrtr.f zerrtz.f - zgbt01.f zgbt02.f zgbt05.f zgelqs.f zgeqls.f zgeqrs.f + zgbt01.f zgbt02.f zgbt05.f zgeqls.f zgerqs.f zget01.f zget02.f zget03.f zget04.f zget07.f zgtt01.f zgtt02.f zgtt05.f zhet01.f zhet01_rook.f zhet01_3.f @@ -239,7 +239,11 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr macro(add_lin_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) + target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) + if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) + string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") + target_link_libraries(${name} omp pthread) + endif() #${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() diff --git a/lapack-netlib/TESTING/LIN/Makefile b/lapack-netlib/TESTING/LIN/Makefile index 54b26455e..714efa52a 100644 --- a/lapack-netlib/TESTING/LIN/Makefile +++ b/lapack-netlib/TESTING/LIN/Makefile @@ -45,7 +45,7 @@ DZLNTST = dlaord.o SLINTST = schkaa.o \ schkeq.o schkgb.o schkge.o schkgt.o \ schklq.o schkpb.o schkpo.o schkps.o schkpp.o \ - schkpt.o schkq3.o schkql.o schkqr.o schkrq.o \ + schkpt.o schkq3.o schkqp3rk.o schkql.o schkqr.o schkrq.o \ schksp.o schksy.o schksy_rook.o schksy_rk.o \ schksy_aa.o schksy_aa_2stage.o schktb.o schktp.o schktr.o \ schktz.o \ @@ -55,7 +55,7 @@ SLINTST = schkaa.o \ serrgt.o serrlq.o serrls.o \ serrps.o serrql.o serrqp.o serrqr.o \ serrrq.o serrtr.o serrtz.o \ - sgbt01.o sgbt02.o sgbt05.o sgelqs.o sgeqls.o sgeqrs.o \ + sgbt01.o sgbt02.o sgbt05.o sgeqls.o \ sgerqs.o sget01.o sget02.o \ sget03.o sget04.o sget06.o sget07.o sgtt01.o sgtt02.o \ sgtt05.o slaptm.o slarhs.o slatb4.o slatb5.o slattb.o slattp.o \ @@ -89,7 +89,7 @@ CLINTST = cchkaa.o \ cchkeq.o cchkgb.o cchkge.o cchkgt.o \ cchkhe.o cchkhe_rook.o cchkhe_rk.o \ cchkhe_aa.o cchkhe_aa_2stage.o cchkhp.o cchklq.o cchkpb.o \ - cchkpo.o cchkps.o cchkpp.o cchkpt.o cchkq3.o cchkql.o \ + cchkpo.o cchkps.o cchkpp.o cchkpt.o cchkq3.o cchkqp3rk.o cchkql.o \ cchkqr.o cchkrq.o cchksp.o cchksy.o cchksy_rook.o cchksy_rk.o \ cchksy_aa.o cchksy_aa_2stage.o cchktb.o \ cchktp.o cchktr.o cchktz.o \ @@ -100,7 +100,7 @@ CLINTST = cchkaa.o \ cerrgt.o cerrlq.o \ cerrls.o cerrps.o cerrql.o cerrqp.o \ cerrqr.o cerrrq.o cerrtr.o cerrtz.o \ - cgbt01.o cgbt02.o cgbt05.o cgelqs.o cgeqls.o cgeqrs.o \ + cgbt01.o cgbt02.o cgbt05.o cgeqls.o \ cgerqs.o cget01.o cget02.o \ cget03.o cget04.o cget07.o cgtt01.o cgtt02.o \ cgtt05.o chet01.o chet01_rook.o chet01_3.o chet01_aa.o \ @@ -137,7 +137,7 @@ endif DLINTST = dchkaa.o \ dchkeq.o dchkgb.o dchkge.o dchkgt.o \ dchklq.o dchkpb.o dchkpo.o dchkps.o dchkpp.o \ - dchkpt.o dchkq3.o dchkql.o dchkqr.o dchkrq.o \ + dchkpt.o dchkq3.o dchkqp3rk.o dchkql.o dchkqr.o dchkrq.o \ dchksp.o dchksy.o dchksy_rook.o dchksy_rk.o \ dchksy_aa.o dchksy_aa_2stage.o dchktb.o dchktp.o dchktr.o \ dchktz.o \ @@ -147,7 +147,7 @@ DLINTST = dchkaa.o \ derrgt.o derrlq.o derrls.o \ derrps.o derrql.o derrqp.o derrqr.o \ derrrq.o derrtr.o derrtz.o \ - dgbt01.o dgbt02.o dgbt05.o dgelqs.o dgeqls.o dgeqrs.o \ + dgbt01.o dgbt02.o dgbt05.o dgeqls.o \ dgerqs.o dget01.o dget02.o \ dget03.o dget04.o dget06.o dget07.o dgtt01.o dgtt02.o \ dgtt05.o dlaptm.o dlarhs.o dlatb4.o dlatb5.o dlattb.o dlattp.o \ @@ -182,7 +182,7 @@ ZLINTST = zchkaa.o \ zchkeq.o zchkgb.o zchkge.o zchkgt.o \ zchkhe.o zchkhe_rook.o zchkhe_rk.o zchkhe_aa.o zchkhe_aa_2stage.o \ zchkhp.o zchklq.o zchkpb.o \ - zchkpo.o zchkps.o zchkpp.o zchkpt.o zchkq3.o zchkql.o \ + zchkpo.o zchkps.o zchkpp.o zchkpt.o zchkq3.o zchkqp3rk.o zchkql.o \ zchkqr.o zchkrq.o zchksp.o zchksy.o zchksy_rook.o zchksy_rk.o \ zchksy_aa.o zchksy_aa_2stage.o zchktb.o \ zchktp.o zchktr.o zchktz.o \ @@ -192,7 +192,7 @@ ZLINTST = zchkaa.o \ zerrgt.o zerrlq.o \ zerrls.o zerrps.o zerrql.o zerrqp.o \ zerrqr.o zerrrq.o zerrtr.o zerrtz.o \ - zgbt01.o zgbt02.o zgbt05.o zgelqs.o zgeqls.o zgeqrs.o \ + zgbt01.o zgbt02.o zgbt05.o zgeqls.o \ zgerqs.o zget01.o zget02.o \ zget03.o zget04.o zget07.o zgtt01.o zgtt02.o \ zgtt05.o zhet01.o zhet01_rook.o zhet01_3.o zhet01_aa.o \ diff --git a/lapack-netlib/TESTING/LIN/alaerh.f b/lapack-netlib/TESTING/LIN/alaerh.f index 1845888a6..6c8a47f1e 100644 --- a/lapack-netlib/TESTING/LIN/alaerh.f +++ b/lapack-netlib/TESTING/LIN/alaerh.f @@ -797,6 +797,18 @@ WRITE( NOUT, FMT = 9978 ) $ SUBNAM(1:LEN_TRIM( SUBNAM )), INFO, M, N, IMAT END IF +* + ELSE IF( LSAMEN( 2, P2, 'QK' ) ) THEN +* +* xQK: truncated QR factorization with pivoting +* + IF( LSAMEN( 7, SUBNAM( 2: 8 ), 'GEQP3RK' ) ) THEN + WRITE( NOUT, FMT = 9930 ) + $ SUBNAM(1:LEN_TRIM( SUBNAM )), INFO, M, N, KL, N5, IMAT + ELSE IF( LSAMEN( 5, SUBNAM( 2: 6 ), 'LATMS' ) ) THEN + WRITE( NOUT, FMT = 9978 ) + $ SUBNAM(1:LEN_TRIM( SUBNAM )), INFO, M, N, IMAT + END IF * ELSE IF( LSAMEN( 2, P2, 'LQ' ) ) THEN * @@ -1147,6 +1159,11 @@ * What we do next * 9949 FORMAT( ' ==> Doing only the condition estimate for this case' ) +* +* SUBNAM, INFO, M, N, NB, IMAT +* + 9930 FORMAT( ' *** Error code from ', A, '=', I5, / ' ==> M =', I5, + $ ', N =', I5, ', NX =', I5, ', NB =', I4, ', type ', I2 ) * RETURN * diff --git a/lapack-netlib/TESTING/LIN/alahd.f b/lapack-netlib/TESTING/LIN/alahd.f index f0423a23b..8f966c584 100644 --- a/lapack-netlib/TESTING/LIN/alahd.f +++ b/lapack-netlib/TESTING/LIN/alahd.f @@ -584,13 +584,27 @@ * * QR decomposition with column pivoting * - WRITE( IOUNIT, FMT = 9986 )PATH + WRITE( IOUNIT, FMT = 8006 )PATH WRITE( IOUNIT, FMT = 9969 ) WRITE( IOUNIT, FMT = '( '' Test ratios:'' )' ) WRITE( IOUNIT, FMT = 9940 )1 WRITE( IOUNIT, FMT = 9939 )2 WRITE( IOUNIT, FMT = 9938 )3 WRITE( IOUNIT, FMT = '( '' Messages:'' )' ) +* + ELSE IF( LSAMEN( 2, P2, 'QK' ) ) THEN +* +* truncated QR decomposition with column pivoting +* + WRITE( IOUNIT, FMT = 8006 )PATH + WRITE( IOUNIT, FMT = 9871 ) + WRITE( IOUNIT, FMT = '( '' Test ratios:'' )' ) + WRITE( IOUNIT, FMT = 8060 )1 + WRITE( IOUNIT, FMT = 8061 )2 + WRITE( IOUNIT, FMT = 8062 )3 + WRITE( IOUNIT, FMT = 8063 )4 + WRITE( IOUNIT, FMT = 8064 )5 + WRITE( IOUNIT, FMT = '( '' Messages:'' )' ) * ELSE IF( LSAMEN( 2, P2, 'TZ' ) ) THEN * @@ -777,8 +791,10 @@ $ 'triangular-pentagonal matrices' ) 8004 FORMAT( / 1X, A3, ': TS factorization for ', $ 'tall-skinny or short-wide matrices' ) - 8005 FORMAT( / 1X, A3, ': Householder recostruction from TSQR', + 8005 FORMAT( / 1X, A3, ': Householder reconstruction from TSQR', $ ' factorization output ', /,' for tall-skinny matrices.' ) + 8006 FORMAT( / 1X, A3, ': truncated QR factorization', + $ ' with column pivoting' ) * * GE matrix types * @@ -922,6 +938,36 @@ $ / 4X, '3. Geometric distribution', 10X, $ '6. Every second column fixed' ) * +* QK matrix types +* + 9871 FORMAT( 4X, ' 1. Zero matrix', / + $ 4X, ' 2. Random, Diagonal, CNDNUM = 2', / + $ 4X, ' 3. Random, Upper triangular, CNDNUM = 2', / + $ 4X, ' 4. Random, Lower triangular, CNDNUM = 2', / + $ 4X, ' 5. Random, First column is zero, CNDNUM = 2', / + $ 4X, ' 6. Random, Last MINMN column is zero, CNDNUM = 2', / + $ 4X, ' 7. Random, Last N column is zero, CNDNUM = 2', / + $ 4X, ' 8. Random, Middle column in MINMN is zero,', + $ ' CNDNUM = 2', / + $ 4X, ' 9. Random, First half of MINMN columns are zero,', + $ ' CNDNUM = 2', / + $ 4X, '10. Random, Last columns are zero starting from', + $ ' MINMN/2+1, CNDNUM = 2', / + $ 4X, '11. Random, Half MINMN columns in the middle are', + $ ' zero starting from MINMN/2-(MINMN/2)/2+1,' + $ ' CNDNUM = 2', / + $ 4X, '12. Random, Odd columns are ZERO, CNDNUM = 2', / + $ 4X, '13. Random, Even columns are ZERO, CNDNUM = 2', / + $ 4X, '14. Random, CNDNUM = 2', / + $ 4X, '15. Random, CNDNUM = sqrt(0.1/EPS)', / + $ 4X, '16. Random, CNDNUM = 0.1/EPS', / + $ 4X, '17. Random, CNDNUM = 0.1/EPS,', + $ ' one small singular value S(N)=1/CNDNUM', / + $ 4X, '18. Random, CNDNUM = 2, scaled near underflow,', + $ ' NORM = SMALL = SAFMIN', / + $ 4X, '19. Random, CNDNUM = 2, scaled near overflow,', + $ ' NORM = LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) )' ) +* * TZ matrix types * 9968 FORMAT( ' Matrix types (2-3 have condition 1/EPS):', / 4X, @@ -1030,9 +1076,8 @@ $ ' * norm(C) * EPS )' ) 9940 FORMAT( 3X, I2, ': norm(svd(A) - svd(R)) / ', $ '( M * norm(svd(R)) * EPS )' ) - 9939 FORMAT( 3X, I2, ': norm( A*P - Q*R ) / ( M * norm(A) * EPS )' - $ ) - 9938 FORMAT( 3X, I2, ': norm( I - Q''*Q ) / ( M * EPS )' ) + 9939 FORMAT( 3X, I2, ': norm( A*P - Q*R ) / ( M * norm(A) * EPS )') + 9938 FORMAT( 3X, I2, ': norm( I - Q''*Q ) / ( M * EPS )' ) 9937 FORMAT( 3X, I2, ': norm( A - R*Q ) / ( M * norm(A) * EPS )' $ ) 9935 FORMAT( 3X, I2, ': norm( B - A * X ) / ', @@ -1105,6 +1150,15 @@ 8054 FORMAT(3X,I2,': norm( C*Q - C*Q ) / ( M * norm(C) * EPS )' ) 8055 FORMAT(3X,I2,': norm( C*Q'' - C*Q'' ) / ( M * norm(C) * EPS )') + 8060 FORMAT( 3X, I2, ': 2-norm(svd(A) - svd(R)) / ', + $ '( max(M,N) * 2-norm(svd(R)) * EPS )' ) + 8061 FORMAT( 3X, I2, ': 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A)', + $ ' * EPS )') + 8062 FORMAT( 3X, I2, ': 1-norm( I - Q''*Q ) / ( M * EPS )' ) + 8063 FORMAT( 3X, I2, ': Returns 1.0D+100, if abs(R(K+1,K+1))', + $ ' > abs(R(K,K)), where K=1:KFACT-1' ) + 8064 FORMAT( 3X, I2, ': 1-norm(Q**T * B - Q**T * B ) / ( M * EPS )') + * RETURN * diff --git a/lapack-netlib/TESTING/LIN/alareq.f b/lapack-netlib/TESTING/LIN/alareq.f index db18775eb..3f057fa48 100644 --- a/lapack-netlib/TESTING/LIN/alareq.f +++ b/lapack-netlib/TESTING/LIN/alareq.f @@ -28,12 +28,12 @@ *> to evaluate the input line which requested NMATS matrix types for *> PATH. The flow of control is as follows: *> -*> If NMATS = NTYPES then +*> IF NMATS = NTYPES THEN *> DOTYPE(1:NTYPES) = .TRUE. -*> else +*> ELSE *> Read the next input line for NMATS matrix types *> Set DOTYPE(I) = .TRUE. for each valid type I -*> endif +*> END IF *> \endverbatim * * Arguments: diff --git a/lapack-netlib/TESTING/LIN/cchkaa.F b/lapack-netlib/TESTING/LIN/cchkaa.F index ec1534ed4..57d95c741 100644 --- a/lapack-netlib/TESTING/LIN/cchkaa.F +++ b/lapack-netlib/TESTING/LIN/cchkaa.F @@ -69,6 +69,7 @@ *> CLQ 8 List types on next line if 0 < NTYPES < 8 *> CQL 8 List types on next line if 0 < NTYPES < 8 *> CQP 6 List types on next line if 0 < NTYPES < 6 +*> ZQK 19 List types on next line if 0 < NTYPES < 19 *> CTZ 3 List types on next line if 0 < NTYPES < 3 *> CLS 6 List types on next line if 0 < NTYPES < 6 *> CEQ @@ -153,12 +154,11 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - REAL S( 2*NMAX ) - COMPLEX E( NMAX ) * .. * .. Allocatable Arrays .. INTEGER AllocateStatus - REAL, DIMENSION(:), ALLOCATABLE :: RWORK + REAL, DIMENSION(:), ALLOCATABLE :: RWORK, S + COMPLEX, DIMENSION(:), ALLOCATABLE :: E COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK * .. * .. External Functions .. @@ -170,14 +170,14 @@ EXTERNAL ALAREQ, CCHKEQ, CCHKGB, CCHKGE, CCHKGT, CCHKHE, $ CCHKHE_ROOK, CCHKHE_RK, CCHKHE_AA, CCHKHP, $ CCHKLQ, CCHKUNHR_COL, CCHKPB, CCHKPO, CCHKPS, - $ CCHKPP, CCHKPT, CCHKQ3, CCHKQL, CCHKQR, CCHKRQ, - $ CCHKSP, CCHKSY, CCHKSY_ROOK, CCHKSY_RK, - $ CCHKSY_AA, CCHKTB, CCHKTP, CCHKTR, CCHKTZ, - $ CDRVGB, CDRVGE, CDRVGT, CDRVHE, CDRVHE_ROOK, - $ CDRVHE_RK, CDRVHE_AA, CDRVHP, CDRVLS, CDRVPB, - $ CDRVPO, CDRVPP, CDRVPT, CDRVSP, CDRVSY, - $ CDRVSY_ROOK, CDRVSY_RK, CDRVSY_AA, ILAVER, - $ CCHKQRT, CCHKQRTP + $ CCHKPP, CCHKPT, CCHKQ3, CCHKQP3RK, CCHKQL, + $ CCHKQR, CCHKRQ, CCHKSP, CCHKSY, CCHKSY_ROOK, + $ CCHKSY_RK, CCHKSY_AA, CCHKTB, CCHKTP, CCHKTR, + $ CCHKTZ, CDRVGB, CDRVGE, CDRVGT, CDRVHE, + $ CDRVHE_ROOK, CDRVHE_RK, CDRVHE_AA, CDRVHP, + $ CDRVLS, CDRVPB, CDRVPO, CDRVPP, CDRVPT, CDRVSP, + $ CDRVSY, CDRVSY_ROOK, CDRVSY_RK, CDRVSY_AA, + $ ILAVER, CCHKQRT, CCHKQRTP * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -203,6 +203,10 @@ IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" ALLOCATE ( WORK( NMAX, NMAX+MAXRHS+10 ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( E( NMAX ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" ALLOCATE ( RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. @@ -1109,6 +1113,23 @@ ELSE WRITE( NOUT, FMT = 9989 )PATH END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* QK: truncated QR factorization with pivoting +* + NTYPES = 19 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ), + $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ S( 1 ), B( 1, 4 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF * ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN * @@ -1211,6 +1232,8 @@ * DEALLOCATE (A, STAT = AllocateStatus) DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (E, STAT = AllocateStatus) + DEALLOCATE (S, STAT = AllocateStatus) DEALLOCATE (WORK, STAT = AllocateStatus) DEALLOCATE (RWORK, STAT = AllocateStatus) * diff --git a/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f index 30a61261f..d79978e55 100644 --- a/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f @@ -433,9 +433,9 @@ * block factorization, LWORK is the length of AINV. * SRNAMT = 'CHETRF_AA_2STAGE' - LWORK = MIN(N*NB, 3*NMAX*NMAX) - CALL CHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, - $ AINV, (3*NB+1)*N, + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX) + CALL CHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ WORK, LWORK, $ INFO ) @@ -517,7 +517,6 @@ c NT = 1 CALL CLACPY( 'Full', N, NRHS, B, LDA, X, LDA ) * SRNAMT = 'CHETRS_AA_2STAGE' - LWORK = MAX( 1, 3*N-2 ) CALL CHETRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, $ AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ), $ X, LDA, INFO ) diff --git a/lapack-netlib/TESTING/LIN/cchklq.f b/lapack-netlib/TESTING/LIN/cchklq.f index 54107d047..4499de36f 100644 --- a/lapack-netlib/TESTING/LIN/cchklq.f +++ b/lapack-netlib/TESTING/LIN/cchklq.f @@ -235,7 +235,7 @@ REAL RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, CERRLQ, CGELQS, CGET02, + EXTERNAL ALAERH, ALAHD, ALASUM, CERRLQ, CGELS, CGET02, $ CLACPY, CLARHS, CLATB4, CLATMS, CLQT01, CLQT02, $ CLQT03, XLAENV * .. @@ -370,7 +370,7 @@ $ WORK, LWORK, RWORK, RESULT( 3 ) ) NT = NT + 4 * -* If M>=N and K=N, call CGELQS to solve a system +* If M<=N and K=M, call CGELS to solve a system * with NRHS right hand sides and compute the * residual. * @@ -387,14 +387,20 @@ * CALL CLACPY( 'Full', M, NRHS, B, LDA, X, $ LDA ) - SRNAMT = 'CGELQS' - CALL CGELQS( M, N, NRHS, AF, LDA, TAU, X, - $ LDA, WORK, LWORK, INFO ) * -* Check error code from CGELQS. +* Reset AF to the original matrix. CGELS +* factors the matrix before solving the system. +* + CALL CLACPY( 'Full', M, N, A, LDA, AF, LDA ) +* + SRNAMT = 'CGELS' + CALL CGELS( 'No transpose', M, N, NRHS, AF, + $ LDA, X, LDA, WORK, LWORK, INFO ) +* +* Check error code from CGELS. * IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'CGELQS', INFO, 0, ' ', + $ CALL ALAERH( PATH, 'CGELS', INFO, 0, 'N', $ M, N, NRHS, -1, NB, IMAT, $ NFAIL, NERRS, NOUT ) * diff --git a/lapack-netlib/TESTING/LIN/cchkqp3rk.f b/lapack-netlib/TESTING/LIN/cchkqp3rk.f new file mode 100644 index 000000000..79d6add72 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/cchkqp3rk.f @@ -0,0 +1,836 @@ +*> \brief \b CCHKQP3RK +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, +* $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, +* $ B, COPYB, S, TAU, +* $ WORK, RWORK, IWORK, NOUT ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER NM, NN, NNB, NOUT +* REAL THRESH +* .. +* .. Array Arguments .. +* LOGICAL DOTYPE( * ) +* INTEGER IWORK( * ), MVAL( * ), NBVAL( * ), NVAL( * ), +* $ NXVAL( * ) +* REAL S( * ), RWORK( * ) +* COMPLEX A( * ), COPYA( * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CCHKQP3RK tests CGEQP3RK. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] DOTYPE +*> \verbatim +*> DOTYPE is LOGICAL array, dimension (NTYPES) +*> The matrix types to be used for testing. Matrices of type j +*> (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) = +*> .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used. +*> \endverbatim +*> +*> \param[in] NM +*> \verbatim +*> NM is INTEGER +*> The number of values of M contained in the vector MVAL. +*> \endverbatim +*> +*> \param[in] MVAL +*> \verbatim +*> MVAL is INTEGER array, dimension (NM) +*> The values of the matrix row dimension M. +*> \endverbatim +*> +*> \param[in] NN +*> \verbatim +*> NN is INTEGER +*> The number of values of N contained in the vector NVAL. +*> \endverbatim +*> +*> \param[in] NVAL +*> \verbatim +*> NVAL is INTEGER array, dimension (NN) +*> The values of the matrix column dimension N. +*> \endverbatim +*> +*> \param[in] NNS +*> \verbatim +*> NNS is INTEGER +*> The number of values of NRHS contained in the vector NSVAL. +*> \endverbatim +*> +*> \param[in] NSVAL +*> \verbatim +*> NSVAL is INTEGER array, dimension (NNS) +*> The values of the number of right hand sides NRHS. +*> \endverbatim +*> \param[in] NNB +*> \verbatim +*> NNB is INTEGER +*> The number of values of NB and NX contained in the +*> vectors NBVAL and NXVAL. The blocking parameters are used +*> in pairs (NB,NX). +*> \endverbatim +*> +*> \param[in] NBVAL +*> \verbatim +*> NBVAL is INTEGER array, dimension (NNB) +*> The values of the blocksize NB. +*> \endverbatim +*> +*> \param[in] NXVAL +*> \verbatim +*> NXVAL is INTEGER array, dimension (NNB) +*> The values of the crossover point NX. +*> \endverbatim +*> +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> The threshold value for the test ratios. A result is +*> included in the output file if RESULT >= THRESH. To have +*> every test ratio printed, use THRESH = 0. +*> \endverbatim +*> +*> \param[out] A +*> \verbatim +*> A is COMPLEX array, dimension (MMAX*NMAX) +*> where MMAX is the maximum value of M in MVAL and NMAX is the +*> maximum value of N in NVAL. +*> \endverbatim +*> +*> \param[out] COPYA +*> \verbatim +*> COPYA is COMPLEX array, dimension (MMAX*NMAX) +*> \endverbatim +*> +*> \param[out] B +*> \verbatim +*> B is COMPLEX array, dimension (MMAX*NSMAX) +*> where MMAX is the maximum value of M in MVAL and NSMAX is the +*> maximum value of NRHS in NSVAL. +*> \endverbatim +*> +*> \param[out] COPYB +*> \verbatim +*> COPYB is COMPLEX array, dimension (MMAX*NSMAX) +*> \endverbatim +*> +*> \param[out] S +*> \verbatim +*> S is REAL array, dimension +*> (min(MMAX,NMAX)) +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX array, dimension (MMAX) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension +*> (max(M*max(M,N) + 4*min(M,N) + max(M,N))) +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (4*NMAX) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (2*NMAX) +*> \endverbatim +*> +*> \param[in] NOUT +*> \verbatim +*> NOUT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex_lin +* +* ===================================================================== + SUBROUTINE CCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, + $ B, COPYB, S, TAU, + $ WORK, RWORK, IWORK, NOUT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER NM, NN, NNB, NNS, NOUT + REAL THRESH +* .. +* .. Array Arguments .. + LOGICAL DOTYPE( * ) + INTEGER IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ), + $ NSVAL( * ), NXVAL( * ) + REAL S( * ), RWORK( * ) + COMPLEX A( * ), COPYA( * ), B( * ), COPYB( * ), + $ TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NTYPES + PARAMETER ( NTYPES = 19 ) + INTEGER NTESTS + PARAMETER ( NTESTS = 5 ) + REAL ONE, ZERO, BIGNUM + COMPLEX CONE, CZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0, + $ CZERO = ( 0.0E+0, 0.0E+0 ), + $ CONE = ( 1.0E+0, 0.0E+0 ), + $ BIGNUM = 1.0E+38 ) +* .. +* .. Local Scalars .. + CHARACTER DIST, TYPE + CHARACTER*3 PATH + INTEGER I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO, + $ INB, IND_OFFSET_GEN, + $ IND_IN, IND_OUT, INS, INFO, + $ ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO, + $ KFACT, KL, KMAX, KU, LDA, LW, LWORK, + $ LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N, + $ NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS, + $ NRUN, NX, T + REAL ANORM, CNDNUM, EPS, ABSTOL, RELTOL, + $ DTEMP, MAXC2NRMK, RELMAXC2NRMK +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ), ISEEDY( 4 ) + REAL RESULT( NTESTS ), RDUMMY( 1 ) +* .. +* .. External Functions .. + REAL SLAMCH, CQPT01, CQRT11, CQRT12, CLANGE + EXTERNAL SLAMCH, CQPT01, CQRT11, CQRT12, CLANGE +* .. +* .. External Subroutines .. + EXTERNAL ALAERH, ALAHD, ALASUM, SLAORD, ICOPY, CAXPY, + $ XLAENV, CGEQP3RK, CLACPY, CLASET, CLATB4, + $ CLATMS, CUNMQR, CSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN, MOD, REAL +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, IOUNIT, CUNMQR_LWORK +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, IOUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEEDY / 1988, 1989, 1990, 1991 / +* .. +* .. Executable Statements .. +* +* Initialize constants and the random number seed. +* + PATH( 1: 1 ) = 'Complex precision' + PATH( 2: 3 ) = 'QK' + NRUN = 0 + NFAIL = 0 + NERRS = 0 + DO I = 1, 4 + ISEED( I ) = ISEEDY( I ) + END DO + EPS = SLAMCH( 'Epsilon' ) + INFOT = 0 +* + DO IM = 1, NM +* +* Do for each value of M in MVAL. +* + M = MVAL( IM ) + LDA = MAX( 1, M ) +* + DO IN = 1, NN +* +* Do for each value of N in NVAL. +* + N = NVAL( IN ) + MINMN = MIN( M, N ) + LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ), + $ M*N + 2*MINMN + 4*N ) +* + DO INS = 1, NNS + NRHS = NSVAL( INS ) +* +* Set up parameters with CLATB4 and generate +* M-by-NRHS B matrix with CLATMS. +* IMAT = 14: +* Random matrix, CNDNUM = 2, NORM = ONE, +* MODE = 3 (geometric distribution of singular values). +* + CALL CLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'CLATMS' + CALL CLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYB, LDA, WORK, INFO ) +* +* Check error code from CLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'CLATMS', INFO, 0, ' ', M, + $ NRHS, -1, -1, -1, 6, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + DO IMAT = 1, NTYPES +* +* Do the tests only if DOTYPE( IMAT ) is true. +* + IF( .NOT.DOTYPE( IMAT ) ) + $ CYCLE +* +* The type of distribution used to generate the random +* eigen-/singular values: +* ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 ) +* +* Do for each type of NON-SYMMETRIC matrix: CNDNUM NORM MODE +* 1. Zero matrix +* 2. Random, Diagonal, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 3. Random, Upper triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 4. Random, Lower triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 5. Random, First column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 6. Random, Last MINMN column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 7. Random, Last N column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 8. Random, Middle column in MINMN is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 9. Random, First half of MINMN columns are zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 11. Random, Half MINMN columns in the middle are zero starting +* from MINMN/2-(MINMN/2)/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 12. Random, Odd columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 13. Random, Even columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 14. Random, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 15. Random, CNDNUM = sqrt(0.1/EPS) CNDNUM = BADC1 = sqrt(0.1/EPS) ONE 3 ( geometric distribution of singular values ) +* 16. Random, CNDNUM = 0.1/EPS CNDNUM = BADC2 = 0.1/EPS ONE 3 ( geometric distribution of singular values ) +* 17. Random, CNDNUM = 0.1/EPS, CNDNUM = BADC2 = 0.1/EPS ONE 2 ( one small singular value, S(N)=1/CNDNUM ) +* one small singular value S(N)=1/CNDNUM +* 18. Random, CNDNUM = 2, scaled near underflow CNDNUM = 2 SMALL = SAFMIN +* 19. Random, CNDNUM = 2, scaled near overflow CNDNUM = 2 LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) ) 3 ( geometric distribution of singular values ) +* + IF( IMAT.EQ.1 ) THEN +* +* Matrix 1: Zero matrix +* + CALL CLASET( 'Full', M, N, CZERO, CZERO, COPYA, LDA ) + DO I = 1, MINMN + S( I ) = ZERO + END DO +* + ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 ) + $ .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN +* +* Matrices 2-5. +* +* Set up parameters with DLATB4 and generate a test +* matrix with CLATMS. +* + CALL CLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'CLATMS' + CALL CLATMS( M, N, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYA, LDA, WORK, INFO ) +* +* Check error code from CLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'CLATMS', INFO, 0, ' ', M, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + CALL SLAORD( 'Decreasing', MINMN, S, 1 ) +* + ELSE IF( MINMN.GE.2 + $ .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN +* +* Rectangular matrices 5-13 that contain zero columns, +* only for matrices MINMN >=2. +* +* JB_ZERO is the column index of ZERO block. +* NB_ZERO is the column block size of ZERO block. +* NB_GEN is the column blcok size of the +* generated block. +* J_INC in the non_zero column index increment +* for matrix 12 and 13. +* J_FIRS_NZ is the index of the first non-zero +* column. +* + IF( IMAT.EQ.5 ) THEN +* +* First column is zero. +* + JB_ZERO = 1 + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.6 ) THEN +* +* Last column MINMN is zero. +* + JB_ZERO = MINMN + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.7 ) THEN +* +* Last column N is zero. +* + JB_ZERO = N + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.8 ) THEN +* +* Middle column in MINMN is zero. +* + JB_ZERO = MINMN / 2 + 1 + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.9 ) THEN +* +* First half of MINMN columns is zero. +* + JB_ZERO = 1 + NB_ZERO = MINMN / 2 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.10 ) THEN +* +* Last columns are zero columns, +* starting from (MINMN / 2 + 1) column. +* + JB_ZERO = MINMN / 2 + 1 + NB_ZERO = N - JB_ZERO + 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.11 ) THEN +* +* Half of the columns in the middle of MINMN +* columns is zero, starting from +* MINMN/2 - (MINMN/2)/2 + 1 column. +* + JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1 + NB_ZERO = MINMN / 2 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.12 ) THEN +* +* Odd-numbered columns are zero, +* + NB_GEN = N / 2 + NB_ZERO = N - NB_GEN + J_INC = 2 + J_FIRST_NZ = 2 +* + ELSE IF( IMAT.EQ.13 ) THEN +* +* Even-numbered columns are zero. +* + NB_ZERO = N / 2 + NB_GEN = N - NB_ZERO + J_INC = 2 + J_FIRST_NZ = 1 +* + END IF +* +* +* 1) Set the first NB_ZERO columns in COPYA(1:M,1:N) +* to zero. +* + CALL CLASET( 'Full', M, NB_ZERO, CZERO, CZERO, + $ COPYA, LDA ) +* +* 2) Generate an M-by-(N-NB_ZERO) matrix with the +* chosen singular value distribution +* in COPYA(1:M,NB_ZERO+1:N). +* + CALL CLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU, + $ ANORM, MODE, CNDNUM, DIST ) +* + SRNAMT = 'CLATMS' +* + IND_OFFSET_GEN = NB_ZERO * LDA +* + CALL CLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYA( IND_OFFSET_GEN + 1 ), LDA, + $ WORK, INFO ) +* +* Check error code from CLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'CLATMS', INFO, 0, ' ', M, + $ NB_GEN, -1, -1, -1, IMAT, NFAIL, + $ NERRS, NOUT ) + CYCLE + END IF +* +* 3) Swap the gererated colums from the right side +* NB_GEN-size block in COPYA into correct column +* positions. +* + IF( IMAT.EQ.6 + $ .OR. IMAT.EQ.7 + $ .OR. IMAT.EQ.8 + $ .OR. IMAT.EQ.10 + $ .OR. IMAT.EQ.11 ) THEN +* +* Move by swapping the generated columns +* from the right NB_GEN-size block from +* (NB_ZERO+1:NB_ZERO+JB_ZERO) +* into columns (1:JB_ZERO-1). +* + DO J = 1, JB_ZERO-1, 1 + CALL CSWAP( M, + $ COPYA( ( NB_ZERO+J-1)*LDA+1), 1, + $ COPYA( (J-1)*LDA + 1 ), 1 ) + END DO +* + ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN +* +* ( IMAT = 12, Odd-numbered ZERO columns. ) +* Swap the generated columns from the right +* NB_GEN-size block into the even zero colums in the +* left NB_ZERO-size block. +* +* ( IMAT = 13, Even-numbered ZERO columns. ) +* Swap the generated columns from the right +* NB_GEN-size block into the odd zero colums in the +* left NB_ZERO-size block. +* + DO J = 1, NB_GEN, 1 + IND_OUT = ( NB_ZERO+J-1 )*LDA + 1 + IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA + $ + 1 + CALL CSWAP( M, + $ COPYA( IND_OUT ), 1, + $ COPYA( IND_IN), 1 ) + END DO +* + END IF +* +* 5) Order the singular values generated by +* DLAMTS in decreasing order and add trailing zeros +* that correspond to zero columns. +* The total number of singular values is MINMN. +* + MINMNB_GEN = MIN( M, NB_GEN ) +* + CALL SLAORD( 'Decreasing', MINMNB_GEN, S, 1 ) + + DO I = MINMNB_GEN+1, MINMN + S( I ) = ZERO + END DO +* + ELSE +* +* IF(MINMN.LT.2) skip this size for this matrix type. +* + CYCLE + END IF +* +* Initialize a copy array for a pivot array for DGEQP3RK. +* + DO I = 1, N + IWORK( I ) = 0 + END DO +* + DO INB = 1, NNB +* +* Do for each pair of values (NB,NX) in NBVAL and NXVAL. +* + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) + NX = NXVAL( INB ) + CALL XLAENV( 3, NX ) +* +* We do MIN(M,N)+1 because we need a test for KMAX > N, +* when KMAX is larger than MIN(M,N), KMAX should be +* KMAX = MIN(M,N) +* + DO KMAX = 0, MIN(M,N)+1 +* +* Get a working copy of COPYA into A( 1:M,1:N ). +* Get a working copy of COPYB into A( 1:M, (N+1):NRHS ). +* Get a working copy of COPYB into into B( 1:M, 1:NRHS ). +* Get a working copy of IWORK(1:N) awith zeroes into +* which is going to be used as pivot array IWORK( N+1:2N ). +* NOTE: IWORK(2N+1:3N) is going to be used as a WORK array +* for the routine. +* + CALL CLACPY( 'All', M, N, COPYA, LDA, A, LDA ) + CALL CLACPY( 'All', M, NRHS, COPYB, LDA, + $ A( LDA*N + 1 ), LDA ) + CALL CLACPY( 'All', M, NRHS, COPYB, LDA, + $ B, LDA ) + CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 ) +* + ABSTOL = -1.0 + RELTOl = -1.0 +* +* Compute the QR factorization with pivoting of A +* + LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ), + $ 3*N + NRHS - 1 ) ) +* +* Compute CGEQP3RK factorization of A. +* + SRNAMT = 'CGEQP3RK' + CALL CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, + $ A, LDA, KFACT, MAXC2NRMK, + $ RELMAXC2NRMK, IWORK( N+1 ), TAU, + $ WORK, LW, RWORK, IWORK( 2*N+1 ), + $ INFO ) +* +* Check error code from CGEQP3RK. +* + IF( INFO.LT.0 ) + $ CALL ALAERH( PATH, 'CGEQP3RK', INFO, 0, ' ', + $ M, N, NX, -1, NB, IMAT, + $ NFAIL, NERRS, NOUT ) +* + IF( KFACT.EQ.MINMN ) THEN +* +* Compute test 1: +* +* This test in only for the full rank factorization of +* the matrix A. +* +* Array S(1:min(M,N)) contains svd(A) the sigular values +* of the original matrix A in decreasing absolute value +* order. The test computes svd(R), the vector sigular +* values of the upper trapezoid of A(1:M,1:N) that +* contains the factor R, in decreasing order. The test +* returns the ratio: +* +* 2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS ) +* + RESULT( 1 ) = CQRT12( M, N, A, LDA, S, WORK, + $ LWORK , RWORK ) +* + DO T = 1, 1 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, NB, NX, + $ IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End test 1 +* + END IF + +* Compute test 2: +* +* The test returns the ratio: +* +* 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS ) +* + RESULT( 2 ) = CQPT01( M, N, KFACT, COPYA, A, LDA, TAU, + $ IWORK( N+1 ), WORK, LWORK ) +* +* Compute test 3: +* +* The test returns the ratio: +* +* 1-norm( Q**T * Q - I ) / ( M * EPS ) +* + RESULT( 3 ) = CQRT11( M, KFACT, A, LDA, TAU, WORK, + $ LWORK ) +* +* Print information about the tests that did not pass +* the threshold. +* + DO T = 2, 3 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 +* +* Compute test 4: +* +* This test is only for the factorizations with the +* rank greater than 2. +* The elements on the diagonal of R should be non- +* increasing. +* +* The test returns the ratio: +* +* Returns 1.0D+100 if abs(R(K+1,K+1)) > abs(R(K,K)), +* K=1:KFACT-1 +* + IF( MIN(KFACT, MINMN).GE.2 ) THEN +* + DO J = 1, KFACT-1, 1 +* + DTEMP = (( ABS( A( (J-1)*M+J ) ) - + $ ABS( A( (J)*M+J+1 ) ) ) / + $ ABS( A(1) ) ) +* + IF( DTEMP.LT.ZERO ) THEN + RESULT( 4 ) = BIGNUM + END IF +* + END DO +* +* Print information about the tests that did not +* pass the threshold. +* + DO T = 4, 4 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK', + $ M, N, NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, + $ RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End test 4. +* + END IF +* +* Compute test 5: +* +* This test in only for matrix A with min(M,N) > 0. +* +* The test returns the ratio: +* +* 1-norm(Q**T * B - Q**T * B ) / +* ( M * EPS ) +* +* (1) Compute B:=Q**T * B in the matrix B. +* + IF( MINMN.GT.0 ) THEN +* + LWORK_MQR = MAX(1, NRHS) + CALL CUNMQR( 'Left', 'Conjugate transpose', + $ M, NRHS, KFACT, A, LDA, TAU, B, LDA, + $ WORK, LWORK_MQR, INFO ) +* + DO I = 1, NRHS +* +* Compare N+J-th column of A and J-column of B. +* + CALL CAXPY( M, -CONE, A( ( N+I-1 )*LDA+1 ), 1, + $ B( ( I-1 )*LDA+1 ), 1 ) + END DO +* + RESULT( 5 ) = + $ ABS( + $ CLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY ) / + $ ( REAL( M )*SLAMCH( 'Epsilon' ) ) + $ ) +* +* Print information about the tests that did not pass +* the threshold. +* + DO T = 5, 5 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End compute test 5. +* + END IF +* +* END DO KMAX = 1, MIN(M,N)+1 +* + END DO +* +* END DO for INB = 1, NNB +* + END DO +* +* END DO for IMAT = 1, NTYPES +* + END DO +* +* END DO for INS = 1, NNS +* + END DO +* +* END DO for IN = 1, NN +* + END DO +* +* END DO for IM = 1, NM +* + END DO +* +* Print a summary of the results. +* + CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) +* + 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5, + $ ', KMAX =', I5, ', ABSTOL =', G12.5, + $ ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4, + $ ', type ', I2, ', test ', I2, ', ratio =', G12.5 ) +* +* End of CCHKQP3RK +* + END diff --git a/lapack-netlib/TESTING/LIN/cchkqr.f b/lapack-netlib/TESTING/LIN/cchkqr.f index 7ea178eaf..4fa7413f9 100644 --- a/lapack-netlib/TESTING/LIN/cchkqr.f +++ b/lapack-netlib/TESTING/LIN/cchkqr.f @@ -244,7 +244,7 @@ EXTERNAL CGENND * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, CERRQR, CGEQRS, CGET02, + EXTERNAL ALAERH, ALAHD, ALASUM, CERRQR, CGELS, CGET02, $ CLACPY, CLARHS, CLATB4, CLATMS, CQRT01, $ CQRT01P, CQRT02, CQRT03, XLAENV * .. @@ -371,7 +371,7 @@ IF( .NOT. CGENND( M, N, AF, LDA ) ) $ RESULT( 9 ) = 2*THRESH NT = NT + 1 - ELSE IF( M.GE.N ) THEN + ELSE IF( M.GE.N ) THEN * * Test CUNGQR, using factorization * returned by CQRT01 @@ -388,7 +388,7 @@ $ WORK, LWORK, RWORK, RESULT( 3 ) ) NT = NT + 4 * -* If M>=N and K=N, call CGEQRS to solve a system +* If M>=N and K=N, call CGELS to solve a system * with NRHS right hand sides and compute the * residual. * @@ -405,14 +405,20 @@ * CALL CLACPY( 'Full', M, NRHS, B, LDA, X, $ LDA ) - SRNAMT = 'CGEQRS' - CALL CGEQRS( M, N, NRHS, AF, LDA, TAU, X, - $ LDA, WORK, LWORK, INFO ) * -* Check error code from CGEQRS. +* Reset AF to the original matrix. CGELS +* factors the matrix before solving the system. +* + CALL CLACPY( 'Full', M, N, A, LDA, AF, LDA ) +* + SRNAMT = 'CGELS' + CALL CGELS( 'No transpose', M, N, NRHS, AF, + $ LDA, X, LDA, WORK, LWORK, INFO ) +* +* Check error code from CGELS. * IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'CGEQRS', INFO, 0, ' ', + $ CALL ALAERH( PATH, 'CGELS', INFO, 0, 'N', $ M, N, NRHS, -1, NB, IMAT, $ NFAIL, NERRS, NOUT ) * diff --git a/lapack-netlib/TESTING/LIN/cchktp.f b/lapack-netlib/TESTING/LIN/cchktp.f index 18242ff54..e14f1062e 100644 --- a/lapack-netlib/TESTING/LIN/cchktp.f +++ b/lapack-netlib/TESTING/LIN/cchktp.f @@ -87,7 +87,7 @@ *> \verbatim *> NMAX is INTEGER *> The leading dimension of the work arrays. NMAX >= the -*> maximumm value of N in NVAL. +*> maximum value of N in NVAL. *> \endverbatim *> *> \param[out] AP diff --git a/lapack-netlib/TESTING/LIN/cchktr.f b/lapack-netlib/TESTING/LIN/cchktr.f index 4b09361d8..2953a2bd5 100644 --- a/lapack-netlib/TESTING/LIN/cchktr.f +++ b/lapack-netlib/TESTING/LIN/cchktr.f @@ -201,7 +201,8 @@ * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - REAL RESULT( NTESTS ), SCALE3( 2 ) + REAL RESULT( NTESTS ), RWORK2( 2*NMAX ), + $ SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -542,10 +543,10 @@ SRNAMT = 'CLATRS3' CALL CCOPY( N, X, 1, B, 1 ) CALL CCOPY( N, X, 1, B( N+1 ), 1 ) - CALL CSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL CSSCAL( N, BIGNUM, B( N+1 ), 1 ) CALL CLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, - $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, - $ INFO ) + $ B, MAX(1, N), SCALE3, RWORK, RWORK2, + $ 2*NMAX, INFO ) * * Check error code from CLATRS3. * diff --git a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f index 51cef512d..83e8a17b0 100644 --- a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f @@ -400,9 +400,9 @@ * Factor the matrix and solve the system using CHESV_AA. * SRNAMT = 'CHESV_AA_2STAGE ' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX) CALL CHESV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ X, LDA, WORK, LWORK, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/cerrhe.f b/lapack-netlib/TESTING/LIN/cerrhe.f index d0c5cf6b5..1798aed57 100644 --- a/lapack-netlib/TESTING/LIN/cerrhe.f +++ b/lapack-netlib/TESTING/LIN/cerrhe.f @@ -133,7 +133,7 @@ IF( LSAMEN( 2, C2, 'HE' ) ) THEN * * Test error exits of the routines that use factorization -* of a Hermitian indefinite matrix with patrial +* of a Hermitian indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * CHETRF @@ -576,7 +576,7 @@ CALL CHKXER( 'CHETRS_AA_STAGE', INFOT, NOUT, LERR, OK ) * * Test error exits of the routines that use factorization -* of a Hermitian indefinite packed matrix with patrial +* of a Hermitian indefinite packed matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN diff --git a/lapack-netlib/TESTING/LIN/cerrhex.f b/lapack-netlib/TESTING/LIN/cerrhex.f index b6c889798..83b3a92ad 100644 --- a/lapack-netlib/TESTING/LIN/cerrhex.f +++ b/lapack-netlib/TESTING/LIN/cerrhex.f @@ -137,7 +137,7 @@ IF( LSAMEN( 2, C2, 'HE' ) ) THEN * * Test error exits of the routines that use factorization -* of a Hermitian indefinite matrix with patrial +* of a Hermitian indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * CHETRF @@ -523,7 +523,7 @@ ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN * * Test error exits of the routines that use factorization -* of a Hermitian indefinite packed matrix with patrial +* of a Hermitian indefinite packed matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * CHPTRF diff --git a/lapack-netlib/TESTING/LIN/cerrlq.f b/lapack-netlib/TESTING/LIN/cerrlq.f index 1036835b4..495adac0d 100644 --- a/lapack-netlib/TESTING/LIN/cerrlq.f +++ b/lapack-netlib/TESTING/LIN/cerrlq.f @@ -76,7 +76,7 @@ $ W( NMAX ), X( NMAX ) * .. * .. External Subroutines .. - EXTERNAL ALAESM, CGELQ2, CGELQF, CGELQS, CHKXER, CUNGL2, + EXTERNAL ALAESM, CGELQ2, CGELQF, CHKXER, CUNGL2, $ CUNGLQ, CUNML2, CUNMLQ * .. * .. Scalars in Common .. @@ -140,31 +140,6 @@ CALL CGELQ2( 2, 1, A, 1, B, W, INFO ) CALL CHKXER( 'CGELQ2', INFOT, NOUT, LERR, OK ) * -* CGELQS -* - SRNAMT = 'CGELQS' - INFOT = 1 - CALL CGELQS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL CGELQS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL CGELQS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL CGELQS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 5 - CALL CGELQS( 2, 2, 0, A, 1, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL CGELQS( 1, 2, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 10 - CALL CGELQS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK ) -* * CUNGLQ * SRNAMT = 'CUNGLQ' diff --git a/lapack-netlib/TESTING/LIN/cerrqr.f b/lapack-netlib/TESTING/LIN/cerrqr.f index 21cf22936..30ce001eb 100644 --- a/lapack-netlib/TESTING/LIN/cerrqr.f +++ b/lapack-netlib/TESTING/LIN/cerrqr.f @@ -77,7 +77,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAESM, CGEQR2, CGEQR2P, CGEQRF, CGEQRFP, - $ CGEQRS, CHKXER, CUNG2R, CUNGQR, CUNM2R, + $ CHKXER, CUNG2R, CUNGQR, CUNM2R, $ CUNMQR * .. * .. Scalars in Common .. @@ -170,31 +170,6 @@ CALL CGEQR2P( 2, 1, A, 1, B, W, INFO ) CALL CHKXER( 'CGEQR2P', INFOT, NOUT, LERR, OK ) * -* CGEQRS -* - SRNAMT = 'CGEQRS' - INFOT = 1 - CALL CGEQRS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL CGEQRS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL CGEQRS( 1, 2, 0, A, 2, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL CGEQRS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 5 - CALL CGEQRS( 2, 1, 0, A, 1, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL CGEQRS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 10 - CALL CGEQRS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK ) -* * CUNGQR * SRNAMT = 'CUNGQR' diff --git a/lapack-netlib/TESTING/LIN/cerrsy.f b/lapack-netlib/TESTING/LIN/cerrsy.f index a236d2d18..6d284be98 100644 --- a/lapack-netlib/TESTING/LIN/cerrsy.f +++ b/lapack-netlib/TESTING/LIN/cerrsy.f @@ -130,7 +130,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * CSYTRF @@ -469,7 +469,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * CSPTRF diff --git a/lapack-netlib/TESTING/LIN/cerrsyx.f b/lapack-netlib/TESTING/LIN/cerrsyx.f index 34972668e..c8d269c84 100644 --- a/lapack-netlib/TESTING/LIN/cerrsyx.f +++ b/lapack-netlib/TESTING/LIN/cerrsyx.f @@ -135,7 +135,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * CSYTRF @@ -521,7 +521,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * CSPTRF diff --git a/lapack-netlib/TESTING/LIN/cerrtr.f b/lapack-netlib/TESTING/LIN/cerrtr.f index 9ba784f62..ab83357f8 100644 --- a/lapack-netlib/TESTING/LIN/cerrtr.f +++ b/lapack-netlib/TESTING/LIN/cerrtr.f @@ -70,7 +70,7 @@ * .. Local Scalars .. CHARACTER*2 C2 INTEGER INFO - REAL RCOND, SCALE + REAL RCOND, SCALE, SCALES(0) * .. * .. Local Arrays .. REAL R1( NMAX ), R2( NMAX ), RW( NMAX ) @@ -245,40 +245,40 @@ * SRNAMT = 'CLATRS3' INFOT = 1 - CALL CLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL CLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL CLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL CLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 5 - CALL CLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 6 - CALL CLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 8 - CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 10 - CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 14 - CALL CLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 0, INFO ) + CALL CLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 0, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) * * Test error exits for the packed triangular routines. diff --git a/lapack-netlib/TESTING/LIN/cerrvx.f b/lapack-netlib/TESTING/LIN/cerrvx.f index 701abd161..548d09d0c 100644 --- a/lapack-netlib/TESTING/LIN/cerrvx.f +++ b/lapack-netlib/TESTING/LIN/cerrvx.f @@ -87,7 +87,7 @@ * .. * .. External Subroutines .. EXTERNAL CGBSV, CGBSVX, CGESV, CGESVX, CGTSV, CGTSVX, - $ CHESV, CHESV_RK ,CHESV_ROOK, CHESVX, CHKXER, + $ CHESV, CHESV_RK, CHESV_ROOK, CHESVX, CHKXER, $ CHPSV, CHPSVX, CPBSV, CPBSVX, CPOSV, CPOSVX, $ CPPSV, CPPSVX, CPTSV, CPTSVX, CSPSV, CSPSVX, $ CSYSV, CSYSV_AA, CSYSV_RK, CSYSV_ROOK, @@ -651,6 +651,9 @@ INFOT = 3 CALL CHESV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CHESV_ROOK', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CHESV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'CHESV_ROOK', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CHESV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CHESV_ROOK', INFOT, NOUT, LERR, OK ) @@ -710,9 +713,15 @@ INFOT = 3 CALL CHESV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CHESV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CHESV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'CHESV_AA', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CHESV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CHESV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CHESV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO ) + CALL CHKXER( 'CHESV_AA', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'H2' ) ) THEN * @@ -733,16 +742,44 @@ CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) - INFOT = 7 - CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, + INFOT = 13 + CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) +* + ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN +* +* CSYSV_AASEN +* + SRNAMT = 'CSYSV_AA' + INFOT = 1 + CALL CSYSV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CSYSV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CSYSV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CSYSV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO ) + CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN * @@ -763,14 +800,18 @@ CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) - INFOT = 7 - CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, + INFOT = 13 + CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) * @@ -834,6 +875,9 @@ INFOT = 3 CALL CSYSV( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CSYSV ', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CSYSV( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'CSYSV ', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CSYSV( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CSYSV ', INFOT, NOUT, LERR, OK ) @@ -898,6 +942,9 @@ INFOT = 3 CALL CSYSV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CSYSV_ROOK', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CSYSV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'CSYSV_ROOK', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CSYSV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CSYSV_ROOK', INFOT, NOUT, LERR, OK ) diff --git a/lapack-netlib/TESTING/LIN/cgtt01.f b/lapack-netlib/TESTING/LIN/cgtt01.f index e504515c5..64dc5f8fa 100644 --- a/lapack-netlib/TESTING/LIN/cgtt01.f +++ b/lapack-netlib/TESTING/LIN/cgtt01.f @@ -39,7 +39,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/cgtt02.f b/lapack-netlib/TESTING/LIN/cgtt02.f index 702e66eed..8b951acd5 100644 --- a/lapack-netlib/TESTING/LIN/cgtt02.f +++ b/lapack-netlib/TESTING/LIN/cgtt02.f @@ -40,14 +40,14 @@ *> \verbatim *> TRANS is CHARACTER *> Specifies the form of the residual. -*> = 'N': B - A * X (No transpose) +*> = 'N': B - A * X (No transpose) *> = 'T': B - A**T * X (Transpose) *> = 'C': B - A**H * X (Conjugate transpose) *> \endverbatim *> *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/chet01_3.f b/lapack-netlib/TESTING/LIN/chet01_3.f index 58104a357..7e07b5790 100644 --- a/lapack-netlib/TESTING/LIN/chet01_3.f +++ b/lapack-netlib/TESTING/LIN/chet01_3.f @@ -188,7 +188,7 @@ RETURN END IF * -* a) Revert to multiplyers of L +* a) Revert to multipliers of L * CALL CSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/clatb4.f b/lapack-netlib/TESTING/LIN/clatb4.f index eeb0f03a9..233a8631a 100644 --- a/lapack-netlib/TESTING/LIN/clatb4.f +++ b/lapack-netlib/TESTING/LIN/clatb4.f @@ -154,9 +154,6 @@ * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT * .. -* .. External Subroutines .. - EXTERNAL SLABAD -* .. * .. Save statement .. SAVE EPS, SMALL, LARGE, BADC1, BADC2, FIRST * .. @@ -174,11 +171,6 @@ BADC1 = SQRT( BADC2 ) SMALL = SLAMCH( 'Safe minimum' ) LARGE = ONE / SMALL -* -* If it looks like we're on a Cray, take the square root of -* SMALL and LARGE to avoid overflow and underflow problems. -* - CALL SLABAD( SMALL, LARGE ) SMALL = SHRINK*( SMALL / EPS ) LARGE = ONE / SMALL END IF @@ -233,6 +225,110 @@ ELSE ANORM = ONE END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* xQK: truncated QR with pivoting. +* Set parameters to generate a general +* M x N matrix. +* +* Set TYPE, the type of matrix to be generated. 'N' is nonsymmetric. +* + TYPE = 'N' +* +* Set DIST, the type of distribution for the random +* number generator. 'S' is +* + DIST = 'S' +* +* Set the lower and upper bandwidths. +* + IF( IMAT.EQ.2 ) THEN +* +* 2. Random, Diagonal, CNDNUM = 2 +* + KL = 0 + KU = 0 + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE IF( IMAT.EQ.3 ) THEN +* +* 3. Random, Upper triangular, CNDNUM = 2 +* + KL = 0 + KU = MAX( N-1, 0 ) + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE IF( IMAT.EQ.4 ) THEN +* +* 4. Random, Lower triangular, CNDNUM = 2 +* + KL = MAX( M-1, 0 ) + KU = 0 + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE +* +* 5.-19. Rectangular matrix +* + KL = MAX( M-1, 0 ) + KU = MAX( N-1, 0 ) +* + IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN +* +* 5.-14. Random, CNDNUM = 2. +* + CNDNUM = TWO + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.15 ) THEN +* +* 15. Random, CNDNUM = sqrt(0.1/EPS) +* + CNDNUM = BADC1 + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.16 ) THEN +* +* 16. Random, CNDNUM = 0.1/EPS +* + CNDNUM = BADC2 + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.17 ) THEN +* +* 17. Random, CNDNUM = 0.1/EPS, +* one small singular value S(N)=1/CNDNUM +* + CNDNUM = BADC2 + ANORM = ONE + MODE = 2 +* + ELSE IF( IMAT.EQ.18 ) THEN +* +* 18. Random, scaled near underflow +* + CNDNUM = TWO + ANORM = SMALL + MODE = 3 +* + ELSE IF( IMAT.EQ.19 ) THEN +* +* 19. Random, scaled near overflow +* + CNDNUM = TWO + ANORM = LARGE + MODE = 3 +* + END IF +* + END IF * ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN * @@ -517,17 +613,18 @@ * * Set the norm and condition number. * - IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN + MAT = ABS( IMAT ) + IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN CNDNUM = BADC1 - ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN + ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN CNDNUM = BADC2 ELSE CNDNUM = TWO END IF * - IF( IMAT.EQ.4 ) THEN + IF( MAT.EQ.4 ) THEN ANORM = SMALL - ELSE IF( IMAT.EQ.5 ) THEN + ELSE IF( MAT.EQ.5 ) THEN ANORM = LARGE ELSE ANORM = ONE diff --git a/lapack-netlib/TESTING/LIN/clqt02.f b/lapack-netlib/TESTING/LIN/clqt02.f index 24eb05399..20297f6cc 100644 --- a/lapack-netlib/TESTING/LIN/clqt02.f +++ b/lapack-netlib/TESTING/LIN/clqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> CLQT02 tests CUNGLQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the LQ factorization of an m-by-n matrix A, CLQT02 generates diff --git a/lapack-netlib/TESTING/LIN/cptt01.f b/lapack-netlib/TESTING/LIN/cptt01.f index e4520ec3d..3b117ad28 100644 --- a/lapack-netlib/TESTING/LIN/cptt01.f +++ b/lapack-netlib/TESTING/LIN/cptt01.f @@ -36,7 +36,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/cptt02.f b/lapack-netlib/TESTING/LIN/cptt02.f index da4f0e854..ffaef89bd 100644 --- a/lapack-netlib/TESTING/LIN/cptt02.f +++ b/lapack-netlib/TESTING/LIN/cptt02.f @@ -46,7 +46,7 @@ *> *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/cqlt02.f b/lapack-netlib/TESTING/LIN/cqlt02.f index fc4685aa5..53080cd17 100644 --- a/lapack-netlib/TESTING/LIN/cqlt02.f +++ b/lapack-netlib/TESTING/LIN/cqlt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> CQLT02 tests CUNGQL, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QL factorization of an m-by-n matrix A, CQLT02 generates diff --git a/lapack-netlib/TESTING/LIN/cqpt01.f b/lapack-netlib/TESTING/LIN/cqpt01.f index 79fc2dc66..149c5bb7c 100644 --- a/lapack-netlib/TESTING/LIN/cqpt01.f +++ b/lapack-netlib/TESTING/LIN/cqpt01.f @@ -33,7 +33,8 @@ *> Householder vectors, and the rest of AF contains a partially updated *> matrix. *> -*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M) +*> This function returns ||A*P - Q*R|| / ( ||norm(A)||*eps*max(M,N) ) +*> where || . || is matrix one norm. *> \endverbatim * * Arguments: @@ -172,28 +173,28 @@ * NORMA = CLANGE( 'One-norm', M, N, A, LDA, RWORK ) * - DO 30 J = 1, K - DO 10 I = 1, MIN( J, M ) + DO J = 1, K + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = AF( I, J ) - 10 CONTINUE - DO 20 I = J + 1, M + END DO + DO I = J + 1, M WORK( ( J-1 )*M+I ) = ZERO - 20 CONTINUE - 30 CONTINUE - DO 40 J = K + 1, N + END DO + END DO + DO J = K + 1, N CALL CCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 ) - 40 CONTINUE + END DO * CALL CUNMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK, $ M, WORK( M*N+1 ), LWORK-M*N, INFO ) * - DO 50 J = 1, N + DO J = 1, N * * Compare i-th column of QR and jpvt(i)-th column of A * CALL CAXPY( M, CMPLX( -ONE ), A( 1, JPVT( J ) ), 1, $ WORK( ( J-1 )*M+1 ), 1 ) - 50 CONTINUE + END DO * CQPT01 = CLANGE( 'One-norm', M, N, WORK, M, RWORK ) / $ ( REAL( MAX( M, N ) )*SLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/cqrt02.f b/lapack-netlib/TESTING/LIN/cqrt02.f index 62f176cd8..db22496cf 100644 --- a/lapack-netlib/TESTING/LIN/cqrt02.f +++ b/lapack-netlib/TESTING/LIN/cqrt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> CQRT02 tests CUNGQR, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QR factorization of an m-by-n matrix A, CQRT02 generates diff --git a/lapack-netlib/TESTING/LIN/cqrt11.f b/lapack-netlib/TESTING/LIN/cqrt11.f index 494d5e9cd..a52084973 100644 --- a/lapack-netlib/TESTING/LIN/cqrt11.f +++ b/lapack-netlib/TESTING/LIN/cqrt11.f @@ -157,9 +157,9 @@ CALL CUNM2R( 'Left', 'Conjugate transpose', M, M, K, A, LDA, TAU, $ WORK, M, WORK( M*M+1 ), INFO ) * - DO 10 J = 1, M + DO J = 1, M WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE - 10 CONTINUE + END DO * CQRT11 = CLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) / $ ( REAL( M )*SLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/cqrt12.f b/lapack-netlib/TESTING/LIN/cqrt12.f index 4c29423ae..0df2d833b 100644 --- a/lapack-netlib/TESTING/LIN/cqrt12.f +++ b/lapack-netlib/TESTING/LIN/cqrt12.f @@ -28,7 +28,7 @@ *> CQRT12 computes the singular values `svlues' of the upper trapezoid *> of A(1:M,1:N) and returns the ratio *> -*> || s - svlues||/(||svlues||*eps*max(M,N)) +*> || svlues -s ||/( ||s||*eps*max(M,N) ) *> \endverbatim * * Arguments: @@ -125,8 +125,8 @@ EXTERNAL CLANGE, SASUM, SLAMCH, SNRM2 * .. * .. External Subroutines .. - EXTERNAL CGEBD2, CLASCL, CLASET, SAXPY, SBDSQR, SLABAD, - $ SLASCL, XERBLA + EXTERNAL CGEBD2, CLASCL, CLASET, SAXPY, SBDSQR, SLASCL, + $ XERBLA * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, REAL @@ -153,17 +153,16 @@ * Copy upper triangle of A into work * CALL CLASET( 'Full', M, N, CMPLX( ZERO ), CMPLX( ZERO ), WORK, M ) - DO 20 J = 1, N - DO 10 I = 1, MIN( J, M ) + DO J = 1, N + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = A( I, J ) - 10 CONTINUE - 20 CONTINUE + END DO + END DO * * Get machine parameters * SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale work if max entry outside range [SMLNUM,BIGNUM] * @@ -207,9 +206,9 @@ * ELSE * - DO 30 I = 1, MN + DO I = 1, MN RWORK( I ) = ZERO - 30 CONTINUE + END DO END IF * * Compare s and singular values of work diff --git a/lapack-netlib/TESTING/LIN/crqt02.f b/lapack-netlib/TESTING/LIN/crqt02.f index 8625c2f91..81c805dc4 100644 --- a/lapack-netlib/TESTING/LIN/crqt02.f +++ b/lapack-netlib/TESTING/LIN/crqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> CRQT02 tests CUNGRQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the RQ factorization of an m-by-n matrix A, CRQT02 generates diff --git a/lapack-netlib/TESTING/LIN/crzt01.f b/lapack-netlib/TESTING/LIN/crzt01.f index 6a75a8b31..94fdcb4a9 100644 --- a/lapack-netlib/TESTING/LIN/crzt01.f +++ b/lapack-netlib/TESTING/LIN/crzt01.f @@ -158,7 +158,7 @@ * * R = R * P(1) * ... *P(m) * - CALL CUNMRZ( 'Right', 'No tranpose', M, N, M, N-M, AF, LDA, TAU, + CALL CUNMRZ( 'Right', 'No transpose', M, N, M, N-M, AF, LDA, TAU, $ WORK, M, WORK( M*N+1 ), LWORK-M*N, INFO ) * * R = R - A diff --git a/lapack-netlib/TESTING/LIN/csyt01_3.f b/lapack-netlib/TESTING/LIN/csyt01_3.f index f0b316cef..879749a10 100644 --- a/lapack-netlib/TESTING/LIN/csyt01_3.f +++ b/lapack-netlib/TESTING/LIN/csyt01_3.f @@ -188,7 +188,7 @@ RETURN END IF * -* a) Revert to multiplyers of L +* a) Revert to multipliers of L * CALL CSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/dchkaa.F b/lapack-netlib/TESTING/LIN/dchkaa.F index ef9d7808c..6582cac13 100644 --- a/lapack-netlib/TESTING/LIN/dchkaa.F +++ b/lapack-netlib/TESTING/LIN/dchkaa.F @@ -63,6 +63,7 @@ *> DLQ 8 List types on next line if 0 < NTYPES < 8 *> DQL 8 List types on next line if 0 < NTYPES < 8 *> DQP 6 List types on next line if 0 < NTYPES < 6 +*> DQK 19 List types on next line if 0 < NTYPES < 19 *> DTZ 3 List types on next line if 0 < NTYPES < 3 *> DLS 6 List types on next line if 0 < NTYPES < 6 *> DEQ @@ -149,12 +150,12 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - DOUBLE PRECISION E( NMAX ), S( 2*NMAX ) * .. * .. Allocatable Arrays .. INTEGER AllocateStatus - DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK - DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK, S + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: E + DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK * .. * .. External Functions .. LOGICAL LSAME, LSAMEN @@ -164,13 +165,13 @@ * .. External Subroutines .. EXTERNAL ALAREQ, DCHKEQ, DCHKGB, DCHKGE, DCHKGT, DCHKLQ, $ DCHKORHR_COL, DCHKPB, DCHKPO, DCHKPS, DCHKPP, - $ DCHKPT, DCHKQ3, DCHKQL, DCHKQR, DCHKRQ, DCHKSP, - $ DCHKSY, DCHKSY_ROOK, DCHKSY_RK, DCHKSY_AA, - $ DCHKTB, DCHKTP, DCHKTR, DCHKTZ, DDRVGB, DDRVGE, - $ DDRVGT, DDRVLS, DDRVPB, DDRVPO, DDRVPP, DDRVPT, - $ DDRVSP, DDRVSY, DDRVSY_ROOK, DDRVSY_RK, - $ DDRVSY_AA, ILAVER, DCHKLQTP, DCHKQRT, DCHKQRTP, - $ DCHKLQT,DCHKTSQR + $ DCHKPT, DCHKQ3, DCHKQP3RK, DCHKQL, DCHKQR, + $ DCHKRQ, DCHKSP, DCHKSY, DCHKSY_ROOK, DCHKSY_RK, + $ DCHKSY_AA, DCHKTB, DCHKTP, DCHKTR, DCHKTZ, + $ DDRVGB, DDRVGE, DDRVGT, DDRVLS, DDRVPB, DDRVPO, + $ DDRVPP, DDRVPT, DDRVSP, DDRVSY, DDRVSY_ROOK, + $ DDRVSY_RK, DDRVSY_AA, ILAVER, DCHKLQTP, DCHKQRT, + $ DCHKQRTP, DCHKLQT,DCHKTSQR * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -197,6 +198,10 @@ IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" ALLOCATE ( WORK( NMAX, 3*NMAX+MAXRHS+30 ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( E( NMAX ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" ALLOCATE ( RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * @@ -919,9 +924,26 @@ CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) * IF( TSTCHK ) THEN - CALL DCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ THRESH, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 3 ), WORK, IWORK, NOUT ) + CALL DCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, + $ NXVAL, THRESH, A( 1, 1 ), A( 1, 2 ), + $ B( 1, 1 ), B( 1, 3 ), WORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* QK: truncated QR factorization with pivoting +* + NTYPES = 19 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ), + $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), B( 1, 4 ), + $ WORK, IWORK, NOUT ) ELSE WRITE( NOUT, FMT = 9989 )PATH END IF @@ -1054,6 +1076,8 @@ * DEALLOCATE (A, STAT = AllocateStatus) DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (E, STAT = AllocateStatus) + DEALLOCATE (S, STAT = AllocateStatus) DEALLOCATE (WORK, STAT = AllocateStatus) DEALLOCATE (RWORK, STAT = AllocateStatus) * diff --git a/lapack-netlib/TESTING/LIN/dchklq.f b/lapack-netlib/TESTING/LIN/dchklq.f index 70af41fe0..a207e0056 100644 --- a/lapack-netlib/TESTING/LIN/dchklq.f +++ b/lapack-netlib/TESTING/LIN/dchklq.f @@ -235,7 +235,7 @@ DOUBLE PRECISION RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, DERRLQ, DGELQS, DGET02, + EXTERNAL ALAERH, ALAHD, ALASUM, DERRLQ, DGELS, DGET02, $ DLACPY, DLARHS, DLATB4, DLATMS, DLQT01, DLQT02, $ DLQT03, XLAENV * .. @@ -373,7 +373,7 @@ $ WORK, LWORK, RWORK, RESULT( 3 ) ) NT = NT + 4 * -* If M>=N and K=N, call DGELQS to solve a system +* If M<=N and K=M, call DGELS to solve a system * with NRHS right hand sides and compute the * residual. * @@ -390,14 +390,20 @@ * CALL DLACPY( 'Full', M, NRHS, B, LDA, X, $ LDA ) - SRNAMT = 'DGELQS' - CALL DGELQS( M, N, NRHS, AF, LDA, TAU, X, - $ LDA, WORK, LWORK, INFO ) * -* Check error code from DGELQS. +* Reset AF to the original matrix. DGELS +* factors the matrix before solving the system. +* + CALL DLACPY( 'Full', M, N, A, LDA, AF, LDA ) +* + SRNAMT = 'DGELS' + CALL DGELS( 'No transpose', M, N, NRHS, AF, + $ LDA, X, LDA, WORK, LWORK, INFO ) +* +* Check error code from DGELS. * IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'DGELQS', INFO, 0, ' ', + $ CALL ALAERH( PATH, 'DGELS', INFO, 0, 'N', $ M, N, NRHS, -1, NB, IMAT, $ NFAIL, NERRS, NOUT ) * diff --git a/lapack-netlib/TESTING/LIN/dchkq3.f b/lapack-netlib/TESTING/LIN/dchkq3.f index 1fdf07252..494008fa8 100644 --- a/lapack-netlib/TESTING/LIN/dchkq3.f +++ b/lapack-netlib/TESTING/LIN/dchkq3.f @@ -30,7 +30,7 @@ *> *> \verbatim *> -*> DCHKQ3 tests DGEQP3. +*> DCHKQ3 tests DGEQP3. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/TESTING/LIN/dchkqp3rk.f b/lapack-netlib/TESTING/LIN/dchkqp3rk.f new file mode 100644 index 000000000..434d2067e --- /dev/null +++ b/lapack-netlib/TESTING/LIN/dchkqp3rk.f @@ -0,0 +1,832 @@ +*> \brief \b DCHKQP3RK +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, +* $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, +* $ B, COPYB, S, TAU, +* $ WORK, IWORK, NOUT ) +* IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. +* INTEGER NM, NN, NNS, NNB, NOUT +* DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. +* LOGICAL DOTYPE( * ) +* INTEGER IWORK( * ), MVAL( * ), NBVAL( * ), NSVAL( * ), +* $ NVAL( * ), NXVAL( * ) +* DOUBLE PRECISION A( * ), COPYA( * ), B( * ), COPYB( * ), +* $ S( * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DCHKQP3RK tests DGEQP3RK. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] DOTYPE +*> \verbatim +*> DOTYPE is LOGICAL array, dimension (NTYPES) +*> The matrix types to be used for testing. Matrices of type j +*> (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) = +*> .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used. +*> \endverbatim +*> +*> \param[in] NM +*> \verbatim +*> NM is INTEGER +*> The number of values of M contained in the vector MVAL. +*> \endverbatim +*> +*> \param[in] MVAL +*> \verbatim +*> MVAL is INTEGER array, dimension (NM) +*> The values of the matrix row dimension M. +*> \endverbatim +*> +*> \param[in] NN +*> \verbatim +*> NN is INTEGER +*> The number of values of N contained in the vector NVAL. +*> \endverbatim +*> +*> \param[in] NVAL +*> \verbatim +*> NVAL is INTEGER array, dimension (NN) +*> The values of the matrix column dimension N. +*> \endverbatim +*> +*> \param[in] NNS +*> \verbatim +*> NNS is INTEGER +*> The number of values of NRHS contained in the vector NSVAL. +*> \endverbatim +*> +*> \param[in] NSVAL +*> \verbatim +*> NSVAL is INTEGER array, dimension (NNS) +*> The values of the number of right hand sides NRHS. +*> \endverbatim +*> +*> \param[in] NNB +*> \verbatim +*> NNB is INTEGER +*> The number of values of NB and NX contained in the +*> vectors NBVAL and NXVAL. The blocking parameters are used +*> in pairs (NB,NX). +*> \endverbatim +*> +*> \param[in] NBVAL +*> \verbatim +*> NBVAL is INTEGER array, dimension (NNB) +*> The values of the blocksize NB. +*> \endverbatim +*> +*> \param[in] NXVAL +*> \verbatim +*> NXVAL is INTEGER array, dimension (NNB) +*> The values of the crossover point NX. +*> \endverbatim +*> +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> The threshold value for the test ratios. A result is +*> included in the output file if RESULT >= THRESH. To have +*> every test ratio printed, use THRESH = 0. +*> \endverbatim +*> +*> \param[out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (MMAX*NMAX) +*> where MMAX is the maximum value of M in MVAL and NMAX is the +*> maximum value of N in NVAL. +*> \endverbatim +*> +*> \param[out] COPYA +*> \verbatim +*> COPYA is DOUBLE PRECISION array, dimension (MMAX*NMAX) +*> \endverbatim +*> +*> \param[out] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (MMAX*NSMAX) +*> where MMAX is the maximum value of M in MVAL and NSMAX is the +*> maximum value of NRHS in NSVAL. +*> \endverbatim +*> +*> \param[out] COPYB +*> \verbatim +*> COPYB is DOUBLE PRECISION array, dimension (MMAX*NSMAX) +*> \endverbatim +*> +*> \param[out] S +*> \verbatim +*> S is DOUBLE PRECISION array, dimension +*> (min(MMAX,NMAX)) +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is DOUBLE PRECISION array, dimension (MMAX) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension +*> (MMAX*NMAX + 4*NMAX + MMAX) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (2*NMAX) +*> \endverbatim +*> +*> \param[in] NOUT +*> \verbatim +*> NOUT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup double_lin +* +* ===================================================================== + SUBROUTINE DCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, + $ B, COPYB, S, TAU, + $ WORK, IWORK, NOUT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER NM, NN, NNB, NNS, NOUT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. + LOGICAL DOTYPE( * ) + INTEGER IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ), + $ NSVAL( * ), NXVAL( * ) + DOUBLE PRECISION A( * ), COPYA( * ), B( * ), COPYB( * ), + $ S( * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NTYPES + PARAMETER ( NTYPES = 19 ) + INTEGER NTESTS + PARAMETER ( NTESTS = 5 ) + DOUBLE PRECISION ONE, ZERO, BIGNUM + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0, + $ BIGNUM = 1.0D+38 ) +* .. +* .. Local Scalars .. + CHARACTER DIST, TYPE + CHARACTER*3 PATH + INTEGER I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO, + $ INB, IND_OFFSET_GEN, + $ IND_IN, IND_OUT, INS, INFO, + $ ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO, + $ KFACT, KL, KMAX, KU, LDA, LW, LWORK, + $ LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N, + $ NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS, + $ NRUN, NX, T + DOUBLE PRECISION ANORM, CNDNUM, EPS, ABSTOL, RELTOL, + $ DTEMP, MAXC2NRMK, RELMAXC2NRMK +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ), ISEEDY( 4 ) + DOUBLE PRECISION RESULT( NTESTS ), RDUMMY( 1 ) +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, DQPT01, DQRT11, DQRT12, DLANGE, + $ DLAPY2 + EXTERNAL DLAMCH, DQPT01, DQRT11, DQRT12, DLANGE +* .. +* .. External Subroutines .. + EXTERNAL ALAERH, ALAHD, ALASUM, DAXPY, DGEQP3RK, + $ DLACPY, DLAORD, DLASET, DLATB4, DLATMS, + $ DORMQR, DSWAP, ICOPY, XLAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, MAX, MIN, MOD +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, IOUNIT +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, IOUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEEDY / 1988, 1989, 1990, 1991 / +* .. +* .. Executable Statements .. +* +* Initialize constants and the random number seed. +* + PATH( 1: 1 ) = 'Double precision' + PATH( 2: 3 ) = 'QK' + NRUN = 0 + NFAIL = 0 + NERRS = 0 + DO I = 1, 4 + ISEED( I ) = ISEEDY( I ) + END DO + EPS = DLAMCH( 'Epsilon' ) + INFOT = 0 +* + DO IM = 1, NM +* +* Do for each value of M in MVAL. +* + M = MVAL( IM ) + LDA = MAX( 1, M ) +* + DO IN = 1, NN +* +* Do for each value of N in NVAL. +* + N = NVAL( IN ) + MINMN = MIN( M, N ) + LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ), + $ M*N + 2*MINMN + 4*N ) +* + DO INS = 1, NNS + NRHS = NSVAL( INS ) +* +* Set up parameters with DLATB4 and generate +* M-by-NRHS B matrix with DLATMS. +* IMAT = 14: +* Random matrix, CNDNUM = 2, NORM = ONE, +* MODE = 3 (geometric distribution of singular values). +* + CALL DLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'DLATMS' + CALL DLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYB, LDA, WORK, INFO ) + + +* +* Check error code from DLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'DLATMS', INFO, 0, ' ', M, + $ NRHS, -1, -1, -1, 6, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + DO IMAT = 1, NTYPES +* +* Do the tests only if DOTYPE( IMAT ) is true. +* + IF( .NOT.DOTYPE( IMAT ) ) + $ CYCLE +* +* The type of distribution used to generate the random +* eigen-/singular values: +* ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 ) +* +* Do for each type of NON-SYMMETRIC matrix: CNDNUM NORM MODE +* 1. Zero matrix +* 2. Random, Diagonal, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 3. Random, Upper triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 4. Random, Lower triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 5. Random, First column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 6. Random, Last MINMN column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 7. Random, Last N column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 8. Random, Middle column in MINMN is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 9. Random, First half of MINMN columns are zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 11. Random, Half MINMN columns in the middle are zero starting +* from MINMN/2-(MINMN/2)/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 12. Random, Odd columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 13. Random, Even columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 14. Random, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 15. Random, CNDNUM = sqrt(0.1/EPS) CNDNUM = BADC1 = sqrt(0.1/EPS) ONE 3 ( geometric distribution of singular values ) +* 16. Random, CNDNUM = 0.1/EPS CNDNUM = BADC2 = 0.1/EPS ONE 3 ( geometric distribution of singular values ) +* 17. Random, CNDNUM = 0.1/EPS, CNDNUM = BADC2 = 0.1/EPS ONE 2 ( one small singular value, S(N)=1/CNDNUM ) +* one small singular value S(N)=1/CNDNUM +* 18. Random, CNDNUM = 2, scaled near underflow CNDNUM = 2 SMALL = SAFMIN +* 19. Random, CNDNUM = 2, scaled near overflow CNDNUM = 2 LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) ) 3 ( geometric distribution of singular values ) +* + IF( IMAT.EQ.1 ) THEN +* +* Matrix 1: Zero matrix +* + CALL DLASET( 'Full', M, N, ZERO, ZERO, COPYA, LDA ) + DO I = 1, MINMN + S( I ) = ZERO + END DO +* + ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 ) + $ .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN +* +* Matrices 2-5. +* +* Set up parameters with DLATB4 and generate a test +* matrix with DLATMS. +* + CALL DLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'DLATMS' + CALL DLATMS( M, N, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYA, LDA, WORK, INFO ) +* +* Check error code from DLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'DLATMS', INFO, 0, ' ', M, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + CALL DLAORD( 'Decreasing', MINMN, S, 1 ) +* + ELSE IF( MINMN.GE.2 + $ .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN +* +* Rectangular matrices 5-13 that contain zero columns, +* only for matrices MINMN >=2. +* +* JB_ZERO is the column index of ZERO block. +* NB_ZERO is the column block size of ZERO block. +* NB_GEN is the column blcok size of the +* generated block. +* J_INC in the non_zero column index increment +* for matrix 12 and 13. +* J_FIRS_NZ is the index of the first non-zero +* column. +* + IF( IMAT.EQ.5 ) THEN +* +* First column is zero. +* + JB_ZERO = 1 + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.6 ) THEN +* +* Last column MINMN is zero. +* + JB_ZERO = MINMN + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.7 ) THEN +* +* Last column N is zero. +* + JB_ZERO = N + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.8 ) THEN +* +* Middle column in MINMN is zero. +* + JB_ZERO = MINMN / 2 + 1 + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.9 ) THEN +* +* First half of MINMN columns is zero. +* + JB_ZERO = 1 + NB_ZERO = MINMN / 2 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.10 ) THEN +* +* Last columns are zero columns, +* starting from (MINMN / 2 + 1) column. +* + JB_ZERO = MINMN / 2 + 1 + NB_ZERO = N - JB_ZERO + 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.11 ) THEN +* +* Half of the columns in the middle of MINMN +* columns is zero, starting from +* MINMN/2 - (MINMN/2)/2 + 1 column. +* + JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1 + NB_ZERO = MINMN / 2 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.12 ) THEN +* +* Odd-numbered columns are zero, +* + NB_GEN = N / 2 + NB_ZERO = N - NB_GEN + J_INC = 2 + J_FIRST_NZ = 2 +* + ELSE IF( IMAT.EQ.13 ) THEN +* +* Even-numbered columns are zero. +* + NB_ZERO = N / 2 + NB_GEN = N - NB_ZERO + J_INC = 2 + J_FIRST_NZ = 1 +* + END IF +* +* +* 1) Set the first NB_ZERO columns in COPYA(1:M,1:N) +* to zero. +* + CALL DLASET( 'Full', M, NB_ZERO, ZERO, ZERO, + $ COPYA, LDA ) +* +* 2) Generate an M-by-(N-NB_ZERO) matrix with the +* chosen singular value distribution +* in COPYA(1:M,NB_ZERO+1:N). +* + CALL DLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU, + $ ANORM, MODE, CNDNUM, DIST ) +* + SRNAMT = 'DLATMS' +* + IND_OFFSET_GEN = NB_ZERO * LDA +* + CALL DLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYA( IND_OFFSET_GEN + 1 ), LDA, + $ WORK, INFO ) +* +* Check error code from DLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'DLATMS', INFO, 0, ' ', M, + $ NB_GEN, -1, -1, -1, IMAT, NFAIL, + $ NERRS, NOUT ) + CYCLE + END IF +* +* 3) Swap the gererated colums from the right side +* NB_GEN-size block in COPYA into correct column +* positions. +* + IF( IMAT.EQ.6 + $ .OR. IMAT.EQ.7 + $ .OR. IMAT.EQ.8 + $ .OR. IMAT.EQ.10 + $ .OR. IMAT.EQ.11 ) THEN +* +* Move by swapping the generated columns +* from the right NB_GEN-size block from +* (NB_ZERO+1:NB_ZERO+JB_ZERO) +* into columns (1:JB_ZERO-1). +* + DO J = 1, JB_ZERO-1, 1 + CALL DSWAP( M, + $ COPYA( ( NB_ZERO+J-1)*LDA+1), 1, + $ COPYA( (J-1)*LDA + 1 ), 1 ) + END DO +* + ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN +* +* ( IMAT = 12, Odd-numbered ZERO columns. ) +* Swap the generated columns from the right +* NB_GEN-size block into the even zero colums in the +* left NB_ZERO-size block. +* +* ( IMAT = 13, Even-numbered ZERO columns. ) +* Swap the generated columns from the right +* NB_GEN-size block into the odd zero colums in the +* left NB_ZERO-size block. +* + DO J = 1, NB_GEN, 1 + IND_OUT = ( NB_ZERO+J-1 )*LDA + 1 + IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA + $ + 1 + CALL DSWAP( M, + $ COPYA( IND_OUT ), 1, + $ COPYA( IND_IN), 1 ) + END DO +* + END IF +* +* 5) Order the singular values generated by +* DLAMTS in decreasing order and add trailing zeros +* that correspond to zero columns. +* The total number of singular values is MINMN. +* + MINMNB_GEN = MIN( M, NB_GEN ) +* + DO I = MINMNB_GEN+1, MINMN + S( I ) = ZERO + END DO +* + ELSE +* +* IF(MINMN.LT.2) skip this size for this matrix type. +* + CYCLE + END IF +* +* Initialize a copy array for a pivot array for DGEQP3RK. +* + DO I = 1, N + IWORK( I ) = 0 + END DO +* + DO INB = 1, NNB +* +* Do for each pair of values (NB,NX) in NBVAL and NXVAL. +* + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) + NX = NXVAL( INB ) + CALL XLAENV( 3, NX ) +* +* We do MIN(M,N)+1 because we need a test for KMAX > N, +* when KMAX is larger than MIN(M,N), KMAX should be +* KMAX = MIN(M,N) +* + DO KMAX = 0, MIN(M,N)+1 +* +* Get a working copy of COPYA into A( 1:M,1:N ). +* Get a working copy of COPYB into A( 1:M, (N+1):NRHS ). +* Get a working copy of COPYB into into B( 1:M, 1:NRHS ). +* Get a working copy of IWORK(1:N) awith zeroes into +* which is going to be used as pivot array IWORK( N+1:2N ). +* NOTE: IWORK(2N+1:3N) is going to be used as a WORK array +* for the routine. +* + CALL DLACPY( 'All', M, N, COPYA, LDA, A, LDA ) + CALL DLACPY( 'All', M, NRHS, COPYB, LDA, + $ A( LDA*N + 1 ), LDA ) + CALL DLACPY( 'All', M, NRHS, COPYB, LDA, + $ B, LDA ) + CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 ) +* + ABSTOL = -1.0 + RELTOL = -1.0 +* +* Compute the QR factorization with pivoting of A +* + LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ), + $ 3*N + NRHS - 1 ) ) +* +* Compute DGEQP3RK factorization of A. +* + SRNAMT = 'DGEQP3RK' + CALL DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, + $ A, LDA, KFACT, MAXC2NRMK, + $ RELMAXC2NRMK, IWORK( N+1 ), TAU, + $ WORK, LW, IWORK( 2*N+1 ), INFO ) +* +* Check error code from DGEQP3RK. +* + IF( INFO.LT.0 ) + $ CALL ALAERH( PATH, 'DGEQP3RK', INFO, 0, ' ', + $ M, N, NX, -1, NB, IMAT, + $ NFAIL, NERRS, NOUT ) +* +* Compute test 1: +* +* This test in only for the full rank factorization of +* the matrix A. +* +* Array S(1:min(M,N)) contains svd(A) the sigular values +* of the original matrix A in decreasing absolute value +* order. The test computes svd(R), the vector sigular +* values of the upper trapezoid of A(1:M,1:N) that +* contains the factor R, in decreasing order. The test +* returns the ratio: +* +* 2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS ) +* + IF( KFACT.EQ.MINMN ) THEN +* + RESULT( 1 ) = DQRT12( M, N, A, LDA, S, WORK, + $ LWORK ) +* + DO T = 1, 1 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, NB, NX, + $ IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End test 1 +* + END IF +* +* Compute test 2: +* +* The test returns the ratio: +* +* 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS ) +* + RESULT( 2 ) = DQPT01( M, N, KFACT, COPYA, A, LDA, TAU, + $ IWORK( N+1 ), WORK, LWORK ) +* +* Compute test 3: +* +* The test returns the ratio: +* +* 1-norm( Q**T * Q - I ) / ( M * EPS ) +* + RESULT( 3 ) = DQRT11( M, KFACT, A, LDA, TAU, WORK, + $ LWORK ) +* +* Print information about the tests that did not pass +* the threshold. +* + DO T = 2, 3 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 +* +* Compute test 4: +* +* This test is only for the factorizations with the +* rank greater than 2. +* The elements on the diagonal of R should be non- +* increasing. +* +* The test returns the ratio: +* +* Returns 1.0D+100 if abs(R(K+1,K+1)) > abs(R(K,K)), +* K=1:KFACT-1 +* + IF( MIN(KFACT, MINMN).GE.2 ) THEN +* + DO J = 1, KFACT-1, 1 + + DTEMP = (( ABS( A( (J-1)*M+J ) ) - + $ ABS( A( (J)*M+J+1 ) ) ) / + $ ABS( A(1) ) ) +* + IF( DTEMP.LT.ZERO ) THEN + RESULT( 4 ) = BIGNUM + END IF +* + END DO +* +* Print information about the tests that did not +* pass the threshold. +* + DO T = 4, 4 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK', + $ M, N, NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, + $ RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End test 4. +* + END IF +* +* Compute test 5: +* +* This test in only for matrix A with min(M,N) > 0. +* +* The test returns the ratio: +* +* 1-norm(Q**T * B - Q**T * B ) / +* ( M * EPS ) +* +* (1) Compute B:=Q**T * B in the matrix B. +* + IF( MINMN.GT.0 ) THEN +* + LWORK_MQR = MAX(1, NRHS) + CALL DORMQR( 'Left', 'Transpose', + $ M, NRHS, KFACT, A, LDA, TAU, B, LDA, + $ WORK, LWORK_MQR, INFO ) +* + DO I = 1, NRHS +* +* Compare N+J-th column of A and J-column of B. +* + CALL DAXPY( M, -ONE, A( ( N+I-1 )*LDA+1 ), 1, + $ B( ( I-1 )*LDA+1 ), 1 ) + END DO +* + RESULT( 5 ) = + $ ABS( + $ DLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY ) / + $ ( DBLE( M )*DLAMCH( 'Epsilon' ) ) + $ ) +* +* Print information about the tests that did not pass +* the threshold. +* + DO T = 5, 5 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End compute test 5. +* + END IF +* +* END DO KMAX = 1, MIN(M,N)+1 +* + END DO +* +* END DO for INB = 1, NNB +* + END DO +* +* END DO for IMAT = 1, NTYPES +* + END DO +* +* END DO for INS = 1, NNS +* + END DO +* +* END DO for IN = 1, NN +* + END DO +* +* END DO for IM = 1, NM +* + END DO +* +* Print a summary of the results. +* + CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) +* + 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5, + $ ', KMAX =', I5, ', ABSTOL =', G12.5, + $ ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4, + $ ', type ', I2, ', test ', I2, ', ratio =', G12.5 ) +* +* End of DCHKQP3RK +* + END diff --git a/lapack-netlib/TESTING/LIN/dchkqr.f b/lapack-netlib/TESTING/LIN/dchkqr.f index c729e61a9..8188d7a00 100644 --- a/lapack-netlib/TESTING/LIN/dchkqr.f +++ b/lapack-netlib/TESTING/LIN/dchkqr.f @@ -244,7 +244,7 @@ EXTERNAL DGENND * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, DERRQR, DGEQRS, DGET02, + EXTERNAL ALAERH, ALAHD, ALASUM, DERRQR, DGELS, DGET02, $ DLACPY, DLARHS, DLATB4, DLATMS, DQRT01, $ DQRT01P, DQRT02, DQRT03, XLAENV * .. @@ -372,7 +372,7 @@ IF( .NOT. DGENND( M, N, AF, LDA ) ) $ RESULT( 9 ) = 2*THRESH NT = NT + 1 - ELSE IF( M.GE.N ) THEN + ELSE IF( M.GE.N ) THEN * * Test DORGQR, using factorization * returned by DQRT01 @@ -389,7 +389,7 @@ $ WORK, LWORK, RWORK, RESULT( 3 ) ) NT = NT + 4 * -* If M>=N and K=N, call DGEQRS to solve a system +* If M>=N and K=N, call DGELS to solve a system * with NRHS right hand sides and compute the * residual. * @@ -406,14 +406,20 @@ * CALL DLACPY( 'Full', M, NRHS, B, LDA, X, $ LDA ) - SRNAMT = 'DGEQRS' - CALL DGEQRS( M, N, NRHS, AF, LDA, TAU, X, - $ LDA, WORK, LWORK, INFO ) * -* Check error code from DGEQRS. +* Reset AF. DGELS overwrites the matrix with +* its factorization. +* + CALL DLACPY( 'Full', M, N, A, LDA, AF, LDA ) +* + SRNAMT = 'DGELS' + CALL DGELS( 'No transpose', M, N, NRHS, AF, + $ LDA, X, LDA, WORK, LWORK, INFO ) +* +* Check error code from DGELS. * IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'DGEQRS', INFO, 0, ' ', + $ CALL ALAERH( PATH, 'DGELS', INFO, 0, 'N', $ M, N, NRHS, -1, NB, IMAT, $ NFAIL, NERRS, NOUT ) * diff --git a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f index bc4e77a5a..1940351a4 100644 --- a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f @@ -421,9 +421,9 @@ * block factorization, LWORK is the length of AINV. * SRNAMT = 'DSYTRF_AA_2STAGE' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) CALL DSYTRF_AA_2STAGE( UPLO, N, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ WORK, LWORK, $ INFO ) @@ -503,7 +503,6 @@ c NT = 1 CALL DLACPY( 'Full', N, NRHS, B, LDA, X, LDA ) * SRNAMT = 'DSYTRS_AA_2STAGE' - LWORK = MAX( 1, 3*N-2 ) CALL DSYTRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, $ AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ), $ X, LDA, INFO ) diff --git a/lapack-netlib/TESTING/LIN/dchktp.f b/lapack-netlib/TESTING/LIN/dchktp.f index 9af6150ca..6db5b1376 100644 --- a/lapack-netlib/TESTING/LIN/dchktp.f +++ b/lapack-netlib/TESTING/LIN/dchktp.f @@ -86,7 +86,7 @@ *> \verbatim *> NMAX is INTEGER *> The leading dimension of the work arrays. NMAX >= the -*> maximumm value of N in NVAL. +*> maximum value of N in NVAL. *> \endverbatim *> *> \param[out] AP diff --git a/lapack-netlib/TESTING/LIN/ddrvab.f b/lapack-netlib/TESTING/LIN/ddrvab.f index 9110d8334..5fd32b951 100644 --- a/lapack-netlib/TESTING/LIN/ddrvab.f +++ b/lapack-netlib/TESTING/LIN/ddrvab.f @@ -346,7 +346,7 @@ CALL DGET08( TRANS, N, N, NRHS, A, LDA, X, LDA, WORK, $ LDA, RWORK, RESULT( 1 ) ) * -* Check if the test passes the tesing. +* Check if the test passes the testing. * Print information about the tests that did not * pass the testing. * diff --git a/lapack-netlib/TESTING/LIN/ddrvac.f b/lapack-netlib/TESTING/LIN/ddrvac.f index bd463cee4..3ecbc6a23 100644 --- a/lapack-netlib/TESTING/LIN/ddrvac.f +++ b/lapack-netlib/TESTING/LIN/ddrvac.f @@ -365,7 +365,7 @@ CALL DPOT06( UPLO, N, NRHS, A, LDA, X, LDA, WORK, $ LDA, RWORK, RESULT( 1 ) ) * -* Check if the test passes the tesing. +* Check if the test passes the testing. * Print information about the tests that did not * pass the testing. * diff --git a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f index 91c9e8e9a..d04106ae3 100644 --- a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f @@ -400,9 +400,9 @@ * Factor the matrix and solve the system using DSYSV_AA. * SRNAMT = 'DSYSV_AA_2STAGE ' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) CALL DSYSV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ X, LDA, WORK, LWORK, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/derrlq.f b/lapack-netlib/TESTING/LIN/derrlq.f index d3cfcddd0..76ff4709e 100644 --- a/lapack-netlib/TESTING/LIN/derrlq.f +++ b/lapack-netlib/TESTING/LIN/derrlq.f @@ -76,7 +76,7 @@ $ W( NMAX ), X( NMAX ) * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, DGELQ2, DGELQF, DGELQS, DORGL2, + EXTERNAL ALAESM, CHKXER, DGELQ2, DGELQF, DORGL2, $ DORGLQ, DORML2, DORMLQ * .. * .. Scalars in Common .. @@ -140,31 +140,6 @@ CALL DGELQ2( 2, 1, A, 1, B, W, INFO ) CALL CHKXER( 'DGELQ2', INFOT, NOUT, LERR, OK ) * -* DGELQS -* - SRNAMT = 'DGELQS' - INFOT = 1 - CALL DGELQS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL DGELQS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL DGELQS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL DGELQS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 5 - CALL DGELQS( 2, 2, 0, A, 1, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL DGELQS( 1, 2, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 10 - CALL DGELQS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK ) -* * DORGLQ * SRNAMT = 'DORGLQ' diff --git a/lapack-netlib/TESTING/LIN/derrqr.f b/lapack-netlib/TESTING/LIN/derrqr.f index 03155b133..f7e850b80 100644 --- a/lapack-netlib/TESTING/LIN/derrqr.f +++ b/lapack-netlib/TESTING/LIN/derrqr.f @@ -77,7 +77,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAESM, CHKXER, DGEQR2, DGEQR2P, DGEQRF, - $ DGEQRFP, DGEQRS, DORG2R, DORGQR, DORM2R, + $ DGEQRFP, DORG2R, DORGQR, DORM2R, $ DORMQR * .. * .. Scalars in Common .. @@ -170,31 +170,6 @@ CALL DGEQR2P( 2, 1, A, 1, B, W, INFO ) CALL CHKXER( 'DGEQR2P', INFOT, NOUT, LERR, OK ) * -* DGEQRS -* - SRNAMT = 'DGEQRS' - INFOT = 1 - CALL DGEQRS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL DGEQRS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL DGEQRS( 1, 2, 0, A, 2, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL DGEQRS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 5 - CALL DGEQRS( 2, 1, 0, A, 1, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL DGEQRS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 10 - CALL DGEQRS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK ) -* * DORGQR * SRNAMT = 'DORGQR' diff --git a/lapack-netlib/TESTING/LIN/derrsy.f b/lapack-netlib/TESTING/LIN/derrsy.f index af15a4b8e..eb08d4c7b 100644 --- a/lapack-netlib/TESTING/LIN/derrsy.f +++ b/lapack-netlib/TESTING/LIN/derrsy.f @@ -133,7 +133,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) pivoting. * * DSYTRF @@ -581,7 +581,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) pivoting. * * DSPTRF diff --git a/lapack-netlib/TESTING/LIN/derrsyx.f b/lapack-netlib/TESTING/LIN/derrsyx.f index c2d14caab..495302158 100644 --- a/lapack-netlib/TESTING/LIN/derrsyx.f +++ b/lapack-netlib/TESTING/LIN/derrsyx.f @@ -138,7 +138,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) pivoting. * * DSYTRF @@ -528,7 +528,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) pivoting. * * DSPTRF diff --git a/lapack-netlib/TESTING/LIN/derrtr.f b/lapack-netlib/TESTING/LIN/derrtr.f index d0580497d..878d9070c 100644 --- a/lapack-netlib/TESTING/LIN/derrtr.f +++ b/lapack-netlib/TESTING/LIN/derrtr.f @@ -71,7 +71,7 @@ * .. Local Scalars .. CHARACTER*2 C2 INTEGER INFO - DOUBLE PRECISION RCOND, SCALE + DOUBLE PRECISION RCOND, SCALE, SCALES(0) * .. * .. Local Arrays .. INTEGER IW( NMAX ) @@ -250,40 +250,40 @@ * SRNAMT = 'DLATRS3' INFOT = 1 - CALL DLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL DLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL DLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL DLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 5 - CALL DLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 6 - CALL DLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 8 - CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 10 - CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 14 - CALL DLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 0, INFO ) + CALL DLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 0, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN diff --git a/lapack-netlib/TESTING/LIN/derrvx.f b/lapack-netlib/TESTING/LIN/derrvx.f index 447b00bc4..f2d29f7a3 100644 --- a/lapack-netlib/TESTING/LIN/derrvx.f +++ b/lapack-netlib/TESTING/LIN/derrvx.f @@ -699,21 +699,27 @@ * ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN * -* DSYSV_AA -* - SRNAMT = 'DSYSV_AA' - INFOT = 1 - CALL DSYSV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL DSYSV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL DSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL DSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) +* DSYSV_AASEN +* + SRNAMT = 'DSYSV_AA' + INFOT = 1 + CALL DSYSV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSYSV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DSYSV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DSYSV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO ) + CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN * @@ -734,14 +740,18 @@ CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) - INFOT = 7 - CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, + INFOT = 13 + CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) * diff --git a/lapack-netlib/TESTING/LIN/dgtt01.f b/lapack-netlib/TESTING/LIN/dgtt01.f index 3b5ff9e4c..6fd754db2 100644 --- a/lapack-netlib/TESTING/LIN/dgtt01.f +++ b/lapack-netlib/TESTING/LIN/dgtt01.f @@ -39,7 +39,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/dgtt02.f b/lapack-netlib/TESTING/LIN/dgtt02.f index b3268b138..4fecff86e 100644 --- a/lapack-netlib/TESTING/LIN/dgtt02.f +++ b/lapack-netlib/TESTING/LIN/dgtt02.f @@ -41,14 +41,14 @@ *> \verbatim *> TRANS is CHARACTER *> Specifies the form of the residual. -*> = 'N': B - A * X (No transpose) +*> = 'N': B - A * X (No transpose) *> = 'T': B - A**T * X (Transpose) *> = 'C': B - A**H * X (Conjugate transpose = Transpose) *> \endverbatim *> *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/dlatb4.f b/lapack-netlib/TESTING/LIN/dlatb4.f index 8825d13e7..f3bccd45b 100644 --- a/lapack-netlib/TESTING/LIN/dlatb4.f +++ b/lapack-netlib/TESTING/LIN/dlatb4.f @@ -133,7 +133,7 @@ * * .. Parameters .. DOUBLE PRECISION SHRINK, TENTH - PARAMETER ( SHRINK = 0.25D0, TENTH = 0.1D+0 ) + PARAMETER ( SHRINK = 0.25D+0, TENTH = 0.1D+0 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) DOUBLE PRECISION TWO @@ -153,9 +153,6 @@ * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT * .. -* .. External Subroutines .. - EXTERNAL DLABAD -* .. * .. Save statement .. SAVE EPS, SMALL, LARGE, BADC1, BADC2, FIRST * .. @@ -173,11 +170,6 @@ BADC1 = SQRT( BADC2 ) SMALL = DLAMCH( 'Safe minimum' ) LARGE = ONE / SMALL -* -* If it looks like we're on a Cray, take the square root of -* SMALL and LARGE to avoid overflow and underflow problems. -* - CALL DLABAD( SMALL, LARGE ) SMALL = SHRINK*( SMALL / EPS ) LARGE = ONE / SMALL END IF @@ -232,6 +224,110 @@ ELSE ANORM = ONE END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* xQK: truncated QR with pivoting. +* Set parameters to generate a general +* M x N matrix. +* +* Set TYPE, the type of matrix to be generated. 'N' is nonsymmetric. +* + TYPE = 'N' +* +* Set DIST, the type of distribution for the random +* number generator. 'S' is +* + DIST = 'S' +* +* Set the lower and upper bandwidths. +* + IF( IMAT.EQ.2 ) THEN +* +* 2. Random, Diagonal, CNDNUM = 2 +* + KL = 0 + KU = 0 + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE IF( IMAT.EQ.3 ) THEN +* +* 3. Random, Upper triangular, CNDNUM = 2 +* + KL = 0 + KU = MAX( N-1, 0 ) + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE IF( IMAT.EQ.4 ) THEN +* +* 4. Random, Lower triangular, CNDNUM = 2 +* + KL = MAX( M-1, 0 ) + KU = 0 + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE +* +* 5.-19. Rectangular matrix +* + KL = MAX( M-1, 0 ) + KU = MAX( N-1, 0 ) +* + IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN +* +* 5.-14. Random, CNDNUM = 2. +* + CNDNUM = TWO + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.15 ) THEN +* +* 15. Random, CNDNUM = sqrt(0.1/EPS) +* + CNDNUM = BADC1 + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.16 ) THEN +* +* 16. Random, CNDNUM = 0.1/EPS +* + CNDNUM = BADC2 + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.17 ) THEN +* +* 17. Random, CNDNUM = 0.1/EPS, +* one small singular value S(N)=1/CNDNUM +* + CNDNUM = BADC2 + ANORM = ONE + MODE = 2 +* + ELSE IF( IMAT.EQ.18 ) THEN +* +* 18. Random, scaled near underflow +* + CNDNUM = TWO + ANORM = SMALL + MODE = 3 +* + ELSE IF( IMAT.EQ.19 ) THEN +* +* 19. Random, scaled near overflow +* + CNDNUM = TWO + ANORM = LARGE + MODE = 3 +* + END IF +* + END IF * ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN * @@ -518,17 +614,18 @@ * * Set the norm and condition number. * - IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN + MAT = ABS( IMAT ) + IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN CNDNUM = BADC1 - ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN + ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN CNDNUM = BADC2 ELSE CNDNUM = TWO END IF * - IF( IMAT.EQ.4 ) THEN + IF( MAT.EQ.4 ) THEN ANORM = SMALL - ELSE IF( IMAT.EQ.5 ) THEN + ELSE IF( MAT.EQ.5 ) THEN ANORM = LARGE ELSE ANORM = ONE diff --git a/lapack-netlib/TESTING/LIN/dlqt02.f b/lapack-netlib/TESTING/LIN/dlqt02.f index 158cd0b79..672d82bbc 100644 --- a/lapack-netlib/TESTING/LIN/dlqt02.f +++ b/lapack-netlib/TESTING/LIN/dlqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> DLQT02 tests DORGLQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the LQ factorization of an m-by-n matrix A, DLQT02 generates diff --git a/lapack-netlib/TESTING/LIN/dptt01.f b/lapack-netlib/TESTING/LIN/dptt01.f index 2c6c440ff..9e9c6bd85 100644 --- a/lapack-netlib/TESTING/LIN/dptt01.f +++ b/lapack-netlib/TESTING/LIN/dptt01.f @@ -35,7 +35,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/dptt02.f b/lapack-netlib/TESTING/LIN/dptt02.f index a4802c696..e045b8779 100644 --- a/lapack-netlib/TESTING/LIN/dptt02.f +++ b/lapack-netlib/TESTING/LIN/dptt02.f @@ -35,7 +35,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/dqlt02.f b/lapack-netlib/TESTING/LIN/dqlt02.f index 950cfe67b..7799e0858 100644 --- a/lapack-netlib/TESTING/LIN/dqlt02.f +++ b/lapack-netlib/TESTING/LIN/dqlt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> DQLT02 tests DORGQL, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QL factorization of an m-by-n matrix A, DQLT02 generates diff --git a/lapack-netlib/TESTING/LIN/dqpt01.f b/lapack-netlib/TESTING/LIN/dqpt01.f index 8efbdc774..af3f5dd36 100644 --- a/lapack-netlib/TESTING/LIN/dqpt01.f +++ b/lapack-netlib/TESTING/LIN/dqpt01.f @@ -28,12 +28,13 @@ *> *> DQPT01 tests the QR-factorization with pivoting of a matrix A. The *> array AF contains the (possibly partial) QR-factorization of A, where -*> the upper triangle of AF(1:k,1:k) is a partial triangular factor, -*> the entries below the diagonal in the first k columns are the +*> the upper triangle of AF(1:K,1:K) is a partial triangular factor, +*> the entries below the diagonal in the first K columns are the *> Householder vectors, and the rest of AF contains a partially updated *> matrix. *> -*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M) +*> This function returns ||A*P - Q*R|| / ( ||norm(A)||*eps*max(M,N) ), +*> where || . || is matrix one norm. *> \endverbatim * * Arguments: @@ -172,28 +173,41 @@ * NORMA = DLANGE( 'One-norm', M, N, A, LDA, RWORK ) * - DO 30 J = 1, K - DO 10 I = 1, MIN( J, M ) + DO J = 1, K +* +* Copy the upper triangular part of the factor R stored +* in AF(1:K,1:K) into the work array WORK. +* + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = AF( I, J ) - 10 CONTINUE - DO 20 I = J + 1, M + END DO +* +* Zero out the elements below the diagonal in the work array. +* + DO I = J + 1, M WORK( ( J-1 )*M+I ) = ZERO - 20 CONTINUE - 30 CONTINUE - DO 40 J = K + 1, N + END DO + END DO +* +* Copy columns (K+1,N) from AF into the work array WORK. +* AF(1:K,K+1:N) contains the rectangular block of the upper trapezoidal +* factor R, AF(K+1:M,K+1:N) contains the partially updated residual +* matrix of R. +* + DO J = K + 1, N CALL DCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 ) - 40 CONTINUE + END DO * CALL DORMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK, $ M, WORK( M*N+1 ), LWORK-M*N, INFO ) * - DO 50 J = 1, N + DO J = 1, N * -* Compare i-th column of QR and jpvt(i)-th column of A +* Compare J-th column of QR and JPVT(J)-th column of A. * CALL DAXPY( M, -ONE, A( 1, JPVT( J ) ), 1, WORK( ( J-1 )*M+1 ), $ 1 ) - 50 CONTINUE + END DO * DQPT01 = DLANGE( 'One-norm', M, N, WORK, M, RWORK ) / $ ( DBLE( MAX( M, N ) )*DLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/dqrt02.f b/lapack-netlib/TESTING/LIN/dqrt02.f index d0e4349d1..d41c7b324 100644 --- a/lapack-netlib/TESTING/LIN/dqrt02.f +++ b/lapack-netlib/TESTING/LIN/dqrt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> DQRT02 tests DORGQR, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QR factorization of an m-by-n matrix A, DQRT02 generates diff --git a/lapack-netlib/TESTING/LIN/dqrt11.f b/lapack-netlib/TESTING/LIN/dqrt11.f index 33c7fab37..38bbeb822 100644 --- a/lapack-netlib/TESTING/LIN/dqrt11.f +++ b/lapack-netlib/TESTING/LIN/dqrt11.f @@ -157,9 +157,9 @@ CALL DORM2R( 'Left', 'Transpose', M, M, K, A, LDA, TAU, WORK, M, $ WORK( M*M+1 ), INFO ) * - DO 10 J = 1, M + DO J = 1, M WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE - 10 CONTINUE + END DO * DQRT11 = DLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) / $ ( DBLE( M )*DLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/dqrt12.f b/lapack-netlib/TESTING/LIN/dqrt12.f index 278e01bf0..b8a124c59 100644 --- a/lapack-netlib/TESTING/LIN/dqrt12.f +++ b/lapack-netlib/TESTING/LIN/dqrt12.f @@ -26,7 +26,7 @@ *> DQRT12 computes the singular values `svlues' of the upper trapezoid *> of A(1:M,1:N) and returns the ratio *> -*> || s - svlues||/(||svlues||*eps*max(M,N)) +*> || svlues - s ||/(||s||*eps*max(M,N)) *> \endverbatim * * Arguments: @@ -113,8 +113,7 @@ EXTERNAL DASUM, DLAMCH, DLANGE, DNRM2 * .. * .. External Subroutines .. - EXTERNAL DAXPY, DBDSQR, DGEBD2, DLABAD, DLASCL, DLASET, - $ XERBLA + EXTERNAL DAXPY, DBDSQR, DGEBD2, DLASCL, DLASET, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN @@ -145,17 +144,16 @@ * Copy upper triangle of A into work * CALL DLASET( 'Full', M, N, ZERO, ZERO, WORK, M ) - DO 20 J = 1, N - DO 10 I = 1, MIN( J, M ) + DO J = 1, N + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = A( I, J ) - 10 CONTINUE - 20 CONTINUE + END DO + END DO * * Get machine parameters * SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale work if max entry outside range [SMLNUM,BIGNUM] * @@ -199,16 +197,18 @@ * ELSE * - DO 30 I = 1, MN + DO I = 1, MN WORK( M*N+I ) = ZERO - 30 CONTINUE + END DO END IF * * Compare s and singular values of work * CALL DAXPY( MN, -ONE, S, 1, WORK( M*N+1 ), 1 ) +* DQRT12 = DASUM( MN, WORK( M*N+1 ), 1 ) / - $ ( DLAMCH( 'Epsilon' )*DBLE( MAX( M, N ) ) ) + $ ( DLAMCH('Epsilon') * DBLE( MAX( M, N ) ) ) +* IF( NRMSVL.NE.ZERO ) $ DQRT12 = DQRT12 / NRMSVL * diff --git a/lapack-netlib/TESTING/LIN/drqt02.f b/lapack-netlib/TESTING/LIN/drqt02.f index 7fbb6a6d4..faf639109 100644 --- a/lapack-netlib/TESTING/LIN/drqt02.f +++ b/lapack-netlib/TESTING/LIN/drqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> DRQT02 tests DORGRQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the RQ factorization of an m-by-n matrix A, DRQT02 generates diff --git a/lapack-netlib/TESTING/LIN/drzt01.f b/lapack-netlib/TESTING/LIN/drzt01.f index 8e969aba7..7a88a2c20 100644 --- a/lapack-netlib/TESTING/LIN/drzt01.f +++ b/lapack-netlib/TESTING/LIN/drzt01.f @@ -158,7 +158,7 @@ * * R = R * P(1) * ... *P(m) * - CALL DORMRZ( 'Right', 'No tranpose', M, N, M, N-M, AF, LDA, TAU, + CALL DORMRZ( 'Right', 'No transpose', M, N, M, N-M, AF, LDA, TAU, $ WORK, M, WORK( M*N+1 ), LWORK-M*N, INFO ) * * R = R - A diff --git a/lapack-netlib/TESTING/LIN/dsyt01_3.f b/lapack-netlib/TESTING/LIN/dsyt01_3.f index 2a7d9d142..060a9caed 100644 --- a/lapack-netlib/TESTING/LIN/dsyt01_3.f +++ b/lapack-netlib/TESTING/LIN/dsyt01_3.f @@ -183,7 +183,7 @@ RETURN END IF * -* a) Revert to multiplyers of L +* a) Revert to multipliers of L * CALL DSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/schkaa.F b/lapack-netlib/TESTING/LIN/schkaa.F index a5b826d06..036b13924 100644 --- a/lapack-netlib/TESTING/LIN/schkaa.F +++ b/lapack-netlib/TESTING/LIN/schkaa.F @@ -63,6 +63,7 @@ *> SLQ 8 List types on next line if 0 < NTYPES < 8 *> SQL 8 List types on next line if 0 < NTYPES < 8 *> SQP 6 List types on next line if 0 < NTYPES < 6 +*> DQK 19 List types on next line if 0 < NTYPES < 19 *> STZ 3 List types on next line if 0 < NTYPES < 3 *> SLS 6 List types on next line if 0 < NTYPES < 6 *> SEQ @@ -147,11 +148,11 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - REAL E( NMAX ), S( 2*NMAX ) * .. * .. Allocatable Arrays .. INTEGER AllocateStatus - REAL, DIMENSION(:), ALLOCATABLE :: RWORK + REAL, DIMENSION(:), ALLOCATABLE :: RWORK, S + REAL, DIMENSION(:), ALLOCATABLE :: E REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK * .. * .. External Functions .. @@ -162,13 +163,13 @@ * .. External Subroutines .. EXTERNAL ALAREQ, SCHKEQ, SCHKGB, SCHKGE, SCHKGT, SCHKLQ, $ SCHKORHR_COL, SCHKPB, SCHKPO, SCHKPS, SCHKPP, - $ SCHKPT, SCHKQ3, SCHKQL, SCHKQR, SCHKRQ, SCHKSP, - $ SCHKSY, SCHKSY_ROOK, SCHKSY_RK, SCHKSY_AA, - $ SCHKTB, SCHKTP, SCHKTR, SCHKTZ, SDRVGB, SDRVGE, - $ SDRVGT, SDRVLS, SDRVPB, SDRVPO, SDRVPP, SDRVPT, - $ SDRVSP, SDRVSY, SDRVSY_ROOK, SDRVSY_RK, - $ SDRVSY_AA, ILAVER, SCHKLQTP, SCHKQRT, SCHKQRTP, - $ SCHKLQT, SCHKTSQR + $ SCHKPT, SCHKQ3, SCHKQP3RK, SCHKQL, SCHKQR, + $ SCHKRQ, SCHKSP, SCHKSY, SCHKSY_ROOK, SCHKSY_RK, + $ SCHKSY_AA, SCHKTB, SCHKTP, SCHKTR, SCHKTZ, + $ SDRVGB, SDRVGE, SDRVGT, SDRVLS, SDRVPB, SDRVPO, + $ SDRVPP, SDRVPT, SDRVSP, SDRVSY, SDRVSY_ROOK, + $ SDRVSY_RK, SDRVSY_AA, ILAVER, SCHKLQTP, SCHKQRT, + $ SCHKQRTP, SCHKLQT, SCHKTSQR * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -188,13 +189,17 @@ * .. * .. Allocate memory dynamically .. * - ALLOCATE (A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) + ALLOCATE ( A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) + ALLOCATE ( B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (WORK( NMAX, NMAX+MAXRHS+30 ) , STAT = AllocateStatus ) + ALLOCATE ( WORK( NMAX, 3*NMAX+MAXRHS+30 ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) + ALLOCATE ( E( NMAX ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. * .. Executable Statements .. @@ -920,6 +925,23 @@ ELSE WRITE( NOUT, FMT = 9989 )PATH END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* QK: truncated QR factorization with pivoting +* + NTYPES = 19 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ), + $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), B( 1, 4 ), + $ WORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF * ELSE IF( LSAMEN( 2, C2, 'TZ' ) ) THEN * @@ -1048,6 +1070,8 @@ * DEALLOCATE (A, STAT = AllocateStatus) DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (E, STAT = AllocateStatus) + DEALLOCATE (S, STAT = AllocateStatus) DEALLOCATE (WORK, STAT = AllocateStatus) DEALLOCATE (RWORK, STAT = AllocateStatus) * diff --git a/lapack-netlib/TESTING/LIN/schklq.f b/lapack-netlib/TESTING/LIN/schklq.f index cd66e8d10..9335503f9 100644 --- a/lapack-netlib/TESTING/LIN/schklq.f +++ b/lapack-netlib/TESTING/LIN/schklq.f @@ -235,7 +235,7 @@ REAL RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, SERRLQ, SGELQS, SGET02, + EXTERNAL ALAERH, ALAHD, ALASUM, SERRLQ, SGET02, $ SLACPY, SLARHS, SLATB4, SLATMS, SLQT01, SLQT02, $ SLQT03, XLAENV * .. @@ -370,7 +370,7 @@ $ WORK, LWORK, RWORK, RESULT( 3 ) ) NT = NT + 4 * -* If M>=N and K=N, call SGELQS to solve a system +* If M<=N and K=M, call SGELS to solve a system * with NRHS right hand sides and compute the * residual. * @@ -387,14 +387,20 @@ * CALL SLACPY( 'Full', M, NRHS, B, LDA, X, $ LDA ) - SRNAMT = 'SGELQS' - CALL SGELQS( M, N, NRHS, AF, LDA, TAU, X, - $ LDA, WORK, LWORK, INFO ) * -* Check error code from SGELQS. +* Reset AF to the original matrix. SGELS +* factors the matrix before solving the system. +* + CALL SLACPY( 'Full', M, N, A, LDA, AF, LDA ) +* + SRNAMT = 'SGELS' + CALL SGELS( 'No transpose', M, N, NRHS, AF, + $ LDA, X, LDA, WORK, LWORK, INFO ) +* +* Check error code from SGELS. * IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'SGELQS', INFO, 0, ' ', + $ CALL ALAERH( PATH, 'SGELS', INFO, 0, 'N', $ M, N, NRHS, -1, NB, IMAT, $ NFAIL, NERRS, NOUT ) * diff --git a/lapack-netlib/TESTING/LIN/schkqp3rk.f b/lapack-netlib/TESTING/LIN/schkqp3rk.f new file mode 100644 index 000000000..36cf9370e --- /dev/null +++ b/lapack-netlib/TESTING/LIN/schkqp3rk.f @@ -0,0 +1,831 @@ +*> \brief \b SCHKQP3RK +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, +* $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, +* $ B, COPYB, S, TAU, +* $ WORK, IWORK, NOUT ) +* IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. +* INTEGER NM, NN, NNS, NNB, NOUT +* REAL THRESH +* .. +* .. Array Arguments .. +* LOGICAL DOTYPE( * ) +* INTEGER IWORK( * ), MVAL( * ), NBVAL( * ), NSVAL( * ), +* $ NVAL( * ), NXVAL( * ) +* REAL A( * ), COPYA( * ), B( * ), COPYB( * ), +* $ S( * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SCHKQP3RK tests SGEQP3RK. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] DOTYPE +*> \verbatim +*> DOTYPE is LOGICAL array, dimension (NTYPES) +*> The matrix types to be used for testing. Matrices of type j +*> (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) = +*> .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used. +*> \endverbatim +*> +*> \param[in] NM +*> \verbatim +*> NM is INTEGER +*> The number of values of M contained in the vector MVAL. +*> \endverbatim +*> +*> \param[in] MVAL +*> \verbatim +*> MVAL is INTEGER array, dimension (NM) +*> The values of the matrix row dimension M. +*> \endverbatim +*> +*> \param[in] NN +*> \verbatim +*> NN is INTEGER +*> The number of values of N contained in the vector NVAL. +*> \endverbatim +*> +*> \param[in] NVAL +*> \verbatim +*> NVAL is INTEGER array, dimension (NN) +*> The values of the matrix column dimension N. +*> \endverbatim +*> +*> \param[in] NNS +*> \verbatim +*> NNS is INTEGER +*> The number of values of NRHS contained in the vector NSVAL. +*> \endverbatim +*> +*> \param[in] NSVAL +*> \verbatim +*> NSVAL is INTEGER array, dimension (NNS) +*> The values of the number of right hand sides NRHS. +*> \endverbatim +*> +*> \param[in] NNB +*> \verbatim +*> NNB is INTEGER +*> The number of values of NB and NX contained in the +*> vectors NBVAL and NXVAL. The blocking parameters are used +*> in pairs (NB,NX). +*> \endverbatim +*> +*> \param[in] NBVAL +*> \verbatim +*> NBVAL is INTEGER array, dimension (NNB) +*> The values of the blocksize NB. +*> \endverbatim +*> +*> \param[in] NXVAL +*> \verbatim +*> NXVAL is INTEGER array, dimension (NNB) +*> The values of the crossover point NX. +*> \endverbatim +*> +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> The threshold value for the test ratios. A result is +*> included in the output file if RESULT >= THRESH. To have +*> every test ratio printed, use THRESH = 0. +*> \endverbatim +*> +*> \param[out] A +*> \verbatim +*> A is REAL array, dimension (MMAX*NMAX) +*> where MMAX is the maximum value of M in MVAL and NMAX is the +*> maximum value of N in NVAL. +*> \endverbatim +*> +*> \param[out] COPYA +*> \verbatim +*> COPYA is REAL array, dimension (MMAX*NMAX) +*> \endverbatim +*> +*> \param[out] B +*> \verbatim +*> B is REAL array, dimension (MMAX*NSMAX) +*> where MMAX is the maximum value of M in MVAL and NSMAX is the +*> maximum value of NRHS in NSVAL. +*> \endverbatim +*> +*> \param[out] COPYB +*> \verbatim +*> COPYB is REAL array, dimension (MMAX*NSMAX) +*> \endverbatim +*> +*> \param[out] S +*> \verbatim +*> S is REAL array, dimension +*> (min(MMAX,NMAX)) +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is REAL array, dimension (MMAX) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension +*> (MMAX*NMAX + 4*NMAX + MMAX) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (2*NMAX) +*> \endverbatim +*> +*> \param[in] NOUT +*> \verbatim +*> NOUT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup single_lin +* +* ===================================================================== + SUBROUTINE SCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, + $ B, COPYB, S, TAU, + $ WORK, IWORK, NOUT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER NM, NN, NNB, NNS, NOUT + REAL THRESH +* .. +* .. Array Arguments .. + LOGICAL DOTYPE( * ) + INTEGER IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ), + $ NSVAL( * ), NXVAL( * ) + REAL A( * ), COPYA( * ), B( * ), COPYB( * ), + $ S( * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NTYPES + PARAMETER ( NTYPES = 19 ) + INTEGER NTESTS + PARAMETER ( NTESTS = 5 ) + REAL ONE, ZERO, BIGNUM + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0, + $ BIGNUM = 1.0E+38 ) +* .. +* .. Local Scalars .. + CHARACTER DIST, TYPE + CHARACTER*3 PATH + INTEGER I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO, + $ INB, IND_OFFSET_GEN, + $ IND_IN, IND_OUT, INS, INFO, + $ ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO, + $ KFACT, KL, KMAX, KU, LDA, LW, LWORK, + $ LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N, + $ NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS, + $ NRUN, NX, T + REAL ANORM, CNDNUM, EPS, ABSTOL, RELTOL, + $ DTEMP, MAXC2NRMK, RELMAXC2NRMK +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ), ISEEDY( 4 ) + REAL RESULT( NTESTS ), RDUMMY( 1 ) +* .. +* .. External Functions .. + REAL SLAMCH, SQPT01, SQRT11, SQRT12, SLANGE + EXTERNAL SLAMCH, SQPT01, SQRT11, SQRT12, SLANGE +* .. +* .. External Subroutines .. + EXTERNAL ALAERH, ALAHD, ALASUM, SAXPY, SGEQP3RK, + $ SLACPY, SLAORD, SLASET, SLATB4, SLATMS, + $ SORMQR, SSWAP, ICOPY, XLAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN, MOD, REAL +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, IOUNIT +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, IOUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEEDY / 1988, 1989, 1990, 1991 / +* .. +* .. Executable Statements .. +* +* Initialize constants and the random number seed. +* + PATH( 1: 1 ) = 'Single precision' + PATH( 2: 3 ) = 'QK' + NRUN = 0 + NFAIL = 0 + NERRS = 0 + DO I = 1, 4 + ISEED( I ) = ISEEDY( I ) + END DO + EPS = SLAMCH( 'Epsilon' ) + INFOT = 0 +* + DO IM = 1, NM +* +* Do for each value of M in MVAL. +* + M = MVAL( IM ) + LDA = MAX( 1, M ) +* + DO IN = 1, NN +* +* Do for each value of N in NVAL. +* + N = NVAL( IN ) + MINMN = MIN( M, N ) + LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ), + $ M*N + 2*MINMN + 4*N ) +* + DO INS = 1, NNS + NRHS = NSVAL( INS ) +* +* Set up parameters with SLATB4 and generate +* M-by-NRHS B matrix with SLATMS. +* IMAT = 14: +* Random matrix, CNDNUM = 2, NORM = ONE, +* MODE = 3 (geometric distribution of singular values). +* + CALL SLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'SLATMS' + CALL SLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYB, LDA, WORK, INFO ) + + +* +* Check error code from SLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'SLATMS', INFO, 0, ' ', M, + $ NRHS, -1, -1, -1, 6, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + DO IMAT = 1, NTYPES +* +* Do the tests only if DOTYPE( IMAT ) is true. +* + IF( .NOT.DOTYPE( IMAT ) ) + $ CYCLE +* +* The type of distribution used to generate the random +* eigen-/singular values: +* ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 ) +* +* Do for each type of NON-SYMMETRIC matrix: CNDNUM NORM MODE +* 1. Zero matrix +* 2. Random, Diagonal, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 3. Random, Upper triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 4. Random, Lower triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 5. Random, First column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 6. Random, Last MINMN column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 7. Random, Last N column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 8. Random, Middle column in MINMN is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 9. Random, First half of MINMN columns are zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 11. Random, Half MINMN columns in the middle are zero starting +* from MINMN/2-(MINMN/2)/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 12. Random, Odd columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 13. Random, Even columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 14. Random, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 15. Random, CNDNUM = sqrt(0.1/EPS) CNDNUM = BADC1 = sqrt(0.1/EPS) ONE 3 ( geometric distribution of singular values ) +* 16. Random, CNDNUM = 0.1/EPS CNDNUM = BADC2 = 0.1/EPS ONE 3 ( geometric distribution of singular values ) +* 17. Random, CNDNUM = 0.1/EPS, CNDNUM = BADC2 = 0.1/EPS ONE 2 ( one small singular value, S(N)=1/CNDNUM ) +* one small singular value S(N)=1/CNDNUM +* 18. Random, CNDNUM = 2, scaled near underflow CNDNUM = 2 SMALL = SAFMIN +* 19. Random, CNDNUM = 2, scaled near overflow CNDNUM = 2 LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) ) 3 ( geometric distribution of singular values ) +* + IF( IMAT.EQ.1 ) THEN +* +* Matrix 1: Zero matrix +* + CALL SLASET( 'Full', M, N, ZERO, ZERO, COPYA, LDA ) + DO I = 1, MINMN + S( I ) = ZERO + END DO +* + ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 ) + $ .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN +* +* Matrices 2-5. +* +* Set up parameters with SLATB4 and generate a test +* matrix with SLATMS. +* + CALL SLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'SLATMS' + CALL SLATMS( M, N, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYA, LDA, WORK, INFO ) +* +* Check error code from SLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'SLATMS', INFO, 0, ' ', M, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + CALL SLAORD( 'Decreasing', MINMN, S, 1 ) +* + ELSE IF( MINMN.GE.2 + $ .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN +* +* Rectangular matrices 5-13 that contain zero columns, +* only for matrices MINMN >=2. +* +* JB_ZERO is the column index of ZERO block. +* NB_ZERO is the column block size of ZERO block. +* NB_GEN is the column blcok size of the +* generated block. +* J_INC in the non_zero column index increment +* for matrix 12 and 13. +* J_FIRS_NZ is the index of the first non-zero +* column. +* + IF( IMAT.EQ.5 ) THEN +* +* First column is zero. +* + JB_ZERO = 1 + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.6 ) THEN +* +* Last column MINMN is zero. +* + JB_ZERO = MINMN + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.7 ) THEN +* +* Last column N is zero. +* + JB_ZERO = N + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.8 ) THEN +* +* Middle column in MINMN is zero. +* + JB_ZERO = MINMN / 2 + 1 + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.9 ) THEN +* +* First half of MINMN columns is zero. +* + JB_ZERO = 1 + NB_ZERO = MINMN / 2 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.10 ) THEN +* +* Last columns are zero columns, +* starting from (MINMN / 2 + 1) column. +* + JB_ZERO = MINMN / 2 + 1 + NB_ZERO = N - JB_ZERO + 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.11 ) THEN +* +* Half of the columns in the middle of MINMN +* columns is zero, starting from +* MINMN/2 - (MINMN/2)/2 + 1 column. +* + JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1 + NB_ZERO = MINMN / 2 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.12 ) THEN +* +* Odd-numbered columns are zero, +* + NB_GEN = N / 2 + NB_ZERO = N - NB_GEN + J_INC = 2 + J_FIRST_NZ = 2 +* + ELSE IF( IMAT.EQ.13 ) THEN +* +* Even-numbered columns are zero. +* + NB_ZERO = N / 2 + NB_GEN = N - NB_ZERO + J_INC = 2 + J_FIRST_NZ = 1 +* + END IF +* +* +* 1) Set the first NB_ZERO columns in COPYA(1:M,1:N) +* to zero. +* + CALL SLASET( 'Full', M, NB_ZERO, ZERO, ZERO, + $ COPYA, LDA ) +* +* 2) Generate an M-by-(N-NB_ZERO) matrix with the +* chosen singular value distribution +* in COPYA(1:M,NB_ZERO+1:N). +* + CALL SLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU, + $ ANORM, MODE, CNDNUM, DIST ) +* + SRNAMT = 'SLATMS' +* + IND_OFFSET_GEN = NB_ZERO * LDA +* + CALL SLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYA( IND_OFFSET_GEN + 1 ), LDA, + $ WORK, INFO ) +* +* Check error code from SLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'SLATMS', INFO, 0, ' ', M, + $ NB_GEN, -1, -1, -1, IMAT, NFAIL, + $ NERRS, NOUT ) + CYCLE + END IF +* +* 3) Swap the gererated colums from the right side +* NB_GEN-size block in COPYA into correct column +* positions. +* + IF( IMAT.EQ.6 + $ .OR. IMAT.EQ.7 + $ .OR. IMAT.EQ.8 + $ .OR. IMAT.EQ.10 + $ .OR. IMAT.EQ.11 ) THEN +* +* Move by swapping the generated columns +* from the right NB_GEN-size block from +* (NB_ZERO+1:NB_ZERO+JB_ZERO) +* into columns (1:JB_ZERO-1). +* + DO J = 1, JB_ZERO-1, 1 + CALL SSWAP( M, + $ COPYA( ( NB_ZERO+J-1)*LDA+1), 1, + $ COPYA( (J-1)*LDA + 1 ), 1 ) + END DO +* + ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN +* +* ( IMAT = 12, Odd-numbered ZERO columns. ) +* Swap the generated columns from the right +* NB_GEN-size block into the even zero colums in the +* left NB_ZERO-size block. +* +* ( IMAT = 13, Even-numbered ZERO columns. ) +* Swap the generated columns from the right +* NB_GEN-size block into the odd zero colums in the +* left NB_ZERO-size block. +* + DO J = 1, NB_GEN, 1 + IND_OUT = ( NB_ZERO+J-1 )*LDA + 1 + IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA + $ + 1 + CALL SSWAP( M, + $ COPYA( IND_OUT ), 1, + $ COPYA( IND_IN), 1 ) + END DO +* + END IF +* +* 5) Order the singular values generated by +* DLAMTS in decreasing order and add trailing zeros +* that correspond to zero columns. +* The total number of singular values is MINMN. +* + MINMNB_GEN = MIN( M, NB_GEN ) +* + DO I = MINMNB_GEN+1, MINMN + S( I ) = ZERO + END DO +* + ELSE +* +* IF(MINMN.LT.2) skip this size for this matrix type. +* + CYCLE + END IF +* +* Initialize a copy array for a pivot array for SGEQP3RK. +* + DO I = 1, N + IWORK( I ) = 0 + END DO +* + DO INB = 1, NNB +* +* Do for each pair of values (NB,NX) in NBVAL and NXVAL. +* + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) + NX = NXVAL( INB ) + CALL XLAENV( 3, NX ) +* +* We do MIN(M,N)+1 because we need a test for KMAX > N, +* when KMAX is larger than MIN(M,N), KMAX should be +* KMAX = MIN(M,N) +* + DO KMAX = 0, MIN(M,N)+1 +* +* Get a working copy of COPYA into A( 1:M,1:N ). +* Get a working copy of COPYB into A( 1:M, (N+1):NRHS ). +* Get a working copy of COPYB into into B( 1:M, 1:NRHS ). +* Get a working copy of IWORK(1:N) awith zeroes into +* which is going to be used as pivot array IWORK( N+1:2N ). +* NOTE: IWORK(2N+1:3N) is going to be used as a WORK array +* for the routine. +* + CALL SLACPY( 'All', M, N, COPYA, LDA, A, LDA ) + CALL SLACPY( 'All', M, NRHS, COPYB, LDA, + $ A( LDA*N + 1 ), LDA ) + CALL SLACPY( 'All', M, NRHS, COPYB, LDA, + $ B, LDA ) + CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 ) +* + ABSTOL = -1.0 + RELTOL = -1.0 +* +* Compute the QR factorization with pivoting of A +* + LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ), + $ 3*N + NRHS - 1 ) ) +* +* Compute SGEQP3RK factorization of A. +* + SRNAMT = 'SGEQP3RK' + CALL SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, + $ A, LDA, KFACT, MAXC2NRMK, + $ RELMAXC2NRMK, IWORK( N+1 ), TAU, + $ WORK, LW, IWORK( 2*N+1 ), INFO ) +* +* Check error code from SGEQP3RK. +* + IF( INFO.LT.0 ) + $ CALL ALAERH( PATH, 'SGEQP3RK', INFO, 0, ' ', + $ M, N, NX, -1, NB, IMAT, + $ NFAIL, NERRS, NOUT ) +* +* Compute test 1: +* +* This test in only for the full rank factorization of +* the matrix A. +* +* Array S(1:min(M,N)) contains svd(A) the sigular values +* of the original matrix A in decreasing absolute value +* order. The test computes svd(R), the vector sigular +* values of the upper trapezoid of A(1:M,1:N) that +* contains the factor R, in decreasing order. The test +* returns the ratio: +* +* 2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS ) +* + IF( KFACT.EQ.MINMN ) THEN +* + RESULT( 1 ) = SQRT12( M, N, A, LDA, S, WORK, + $ LWORK ) +* + DO T = 1, 1 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, NB, NX, + $ IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End test 1 +* + END IF +* +* Compute test 2: +* +* The test returns the ratio: +* +* 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS ) +* + RESULT( 2 ) = SQPT01( M, N, KFACT, COPYA, A, LDA, TAU, + $ IWORK( N+1 ), WORK, LWORK ) +* +* Compute test 3: +* +* The test returns the ratio: +* +* 1-norm( Q**T * Q - I ) / ( M * EPS ) +* + RESULT( 3 ) = SQRT11( M, KFACT, A, LDA, TAU, WORK, + $ LWORK ) +* +* Print information about the tests that did not pass +* the threshold. +* + DO T = 2, 3 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 +* +* Compute test 4: +* +* This test is only for the factorizations with the +* rank greater than 2. +* The elements on the diagonal of R should be non- +* increasing. +* +* The test returns the ratio: +* +* Returns 1.0D+100 if abs(R(K+1,K+1)) > abs(R(K,K)), +* K=1:KFACT-1 +* + IF( MIN(KFACT, MINMN).GE.2 ) THEN +* + DO J = 1, KFACT-1, 1 + + DTEMP = (( ABS( A( (J-1)*M+J ) ) - + $ ABS( A( (J)*M+J+1 ) ) ) / + $ ABS( A(1) ) ) +* + IF( DTEMP.LT.ZERO ) THEN + RESULT( 4 ) = BIGNUM + END IF +* + END DO +* +* Print information about the tests that did not +* pass the threshold. +* + DO T = 4, 4 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK', + $ M, N, NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, + $ RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End test 4. +* + END IF +* +* Compute test 5: +* +* This test in only for matrix A with min(M,N) > 0. +* +* The test returns the ratio: +* +* 1-norm(Q**T * B - Q**T * B ) / +* ( M * EPS ) +* +* (1) Compute B:=Q**T * B in the matrix B. +* + IF( MINMN.GT.0 ) THEN +* + LWORK_MQR = MAX(1, NRHS) + CALL SORMQR( 'Left', 'Transpose', + $ M, NRHS, KFACT, A, LDA, TAU, B, LDA, + $ WORK, LWORK_MQR, INFO ) +* + DO I = 1, NRHS +* +* Compare N+J-th column of A and J-column of B. +* + CALL SAXPY( M, -ONE, A( ( N+I-1 )*LDA+1 ), 1, + $ B( ( I-1 )*LDA+1 ), 1 ) + END DO +* + RESULT( 5 ) = + $ ABS( + $ SLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY ) / + $ ( REAL( M )*SLAMCH( 'Epsilon' ) ) + $ ) +* +* Print information about the tests that did not pass +* the threshold. +* + DO T = 5, 5 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End compute test 5. +* + END IF +* +* END DO KMAX = 1, MIN(M,N)+1 +* + END DO +* +* END DO for INB = 1, NNB +* + END DO +* +* END DO for IMAT = 1, NTYPES +* + END DO +* +* END DO for INS = 1, NNS +* + END DO +* +* END DO for IN = 1, NN +* + END DO +* +* END DO for IM = 1, NM +* + END DO +* +* Print a summary of the results. +* + CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) +* + 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5, + $ ', KMAX =', I5, ', ABSTOL =', G12.5, + $ ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4, + $ ', type ', I2, ', test ', I2, ', ratio =', G12.5 ) +* +* End of SCHKQP3RK +* + END diff --git a/lapack-netlib/TESTING/LIN/schkqr.f b/lapack-netlib/TESTING/LIN/schkqr.f index 5c45ede9b..f72c8f1eb 100644 --- a/lapack-netlib/TESTING/LIN/schkqr.f +++ b/lapack-netlib/TESTING/LIN/schkqr.f @@ -244,7 +244,7 @@ EXTERNAL SGENND * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, SERRQR, SGEQRS, SGET02, + EXTERNAL ALAERH, ALAHD, ALASUM, SERRQR, SGELS, SGET02, $ SLACPY, SLARHS, SLATB4, SLATMS, SQRT01, $ SQRT01P, SQRT02, SQRT03, XLAENV * .. @@ -388,7 +388,7 @@ $ WORK, LWORK, RWORK, RESULT( 3 ) ) NT = NT + 4 * -* If M>=N and K=N, call SGEQRS to solve a system +* If M>=N and K=N, call SGELS to solve a system * with NRHS right hand sides and compute the * residual. * @@ -405,14 +405,20 @@ * CALL SLACPY( 'Full', M, NRHS, B, LDA, X, $ LDA ) - SRNAMT = 'SGEQRS' - CALL SGEQRS( M, N, NRHS, AF, LDA, TAU, X, - $ LDA, WORK, LWORK, INFO ) * -* Check error code from SGEQRS. +* Reset AF to the original matrix. SGELS +* factors the matrix before solving the system. +* + CALL SLACPY( 'Full', M, N, A, LDA, AF, LDA ) +* + SRNAMT = 'SGELS' + CALL SGELS( 'No transpose', M, N, NRHS, AF, + $ LDA, X, LDA, WORK, LWORK, INFO ) +* +* Check error code from SGELS. * IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'SGEQRS', INFO, 0, ' ', + $ CALL ALAERH( PATH, 'SGELS', INFO, 0, 'N', $ M, N, NRHS, -1, NB, IMAT, $ NFAIL, NERRS, NOUT ) * diff --git a/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f index d3c27ae56..6490cd7c3 100644 --- a/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f @@ -423,9 +423,9 @@ * block factorization, LWORK is the length of AINV. * SRNAMT = 'SSYTRF_AA_2STAGE' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) CALL SSYTRF_AA_2STAGE( UPLO, N, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ WORK, LWORK, $ INFO ) @@ -505,7 +505,6 @@ CALL SLACPY( 'Full', N, NRHS, B, LDA, X, LDA ) * SRNAMT = 'SSYTRS_AA_2STAGE' - LWORK = MAX( 1, 3*N-2 ) CALL SSYTRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, $ AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ), $ X, LDA, INFO ) diff --git a/lapack-netlib/TESTING/LIN/schktp.f b/lapack-netlib/TESTING/LIN/schktp.f index ff05c1d97..a5243f651 100644 --- a/lapack-netlib/TESTING/LIN/schktp.f +++ b/lapack-netlib/TESTING/LIN/schktp.f @@ -86,7 +86,7 @@ *> \verbatim *> NMAX is INTEGER *> The leading dimension of the work arrays. NMAX >= the -*> maximumm value of N in NVAL. +*> maximum value of N in NVAL. *> \endverbatim *> *> \param[out] AP diff --git a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f index aff32bce9..319b90805 100644 --- a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f @@ -400,9 +400,9 @@ * Factor the matrix and solve the system using SSYSV_AA. * SRNAMT = 'SSYSV_AA_2STAGE ' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) CALL SSYSV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ X, LDA, WORK, LWORK, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/serrlq.f b/lapack-netlib/TESTING/LIN/serrlq.f index 5bb0fe201..e5df8ce52 100644 --- a/lapack-netlib/TESTING/LIN/serrlq.f +++ b/lapack-netlib/TESTING/LIN/serrlq.f @@ -76,7 +76,7 @@ $ W( NMAX ), X( NMAX ) * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, SGELQ2, SGELQF, SGELQS, SORGL2, + EXTERNAL ALAESM, CHKXER, SGELQ2, SGELQF, SORGL2, $ SORGLQ, SORML2, SORMLQ * .. * .. Scalars in Common .. @@ -140,31 +140,6 @@ CALL SGELQ2( 2, 1, A, 1, B, W, INFO ) CALL CHKXER( 'SGELQ2', INFOT, NOUT, LERR, OK ) * -* SGELQS -* - SRNAMT = 'SGELQS' - INFOT = 1 - CALL SGELQS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL SGELQS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL SGELQS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL SGELQS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 5 - CALL SGELQS( 2, 2, 0, A, 1, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL SGELQS( 1, 2, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 10 - CALL SGELQS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK ) -* * SORGLQ * SRNAMT = 'SORGLQ' diff --git a/lapack-netlib/TESTING/LIN/serrqr.f b/lapack-netlib/TESTING/LIN/serrqr.f index 1ad40b7aa..e228813f7 100644 --- a/lapack-netlib/TESTING/LIN/serrqr.f +++ b/lapack-netlib/TESTING/LIN/serrqr.f @@ -77,7 +77,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAESM, CHKXER, SGEQR2, SGEQR2P, SGEQRF, - $ SGEQRFP, SGEQRS, SORG2R, SORGQR, SORM2R, + $ SGEQRFP, SORG2R, SORGQR, SORM2R, $ SORMQR * .. * .. Scalars in Common .. @@ -170,31 +170,6 @@ CALL SGEQR2P( 2, 1, A, 1, B, W, INFO ) CALL CHKXER( 'SGEQR2P', INFOT, NOUT, LERR, OK ) * -* SGEQRS -* - SRNAMT = 'SGEQRS' - INFOT = 1 - CALL SGEQRS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL SGEQRS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL SGEQRS( 1, 2, 0, A, 2, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL SGEQRS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 5 - CALL SGEQRS( 2, 1, 0, A, 1, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL SGEQRS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 10 - CALL SGEQRS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK ) -* * SORGQR * SRNAMT = 'SORGQR' diff --git a/lapack-netlib/TESTING/LIN/serrsy.f b/lapack-netlib/TESTING/LIN/serrsy.f index c562b417b..ab422ba5c 100644 --- a/lapack-netlib/TESTING/LIN/serrsy.f +++ b/lapack-netlib/TESTING/LIN/serrsy.f @@ -133,7 +133,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) pivoting. * * SSYTRF @@ -581,7 +581,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) pivoting. * * SSPTRF diff --git a/lapack-netlib/TESTING/LIN/serrsyx.f b/lapack-netlib/TESTING/LIN/serrsyx.f index ed47c37bd..cffd28a52 100644 --- a/lapack-netlib/TESTING/LIN/serrsyx.f +++ b/lapack-netlib/TESTING/LIN/serrsyx.f @@ -137,7 +137,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) pivoting. * * SSYTRF @@ -527,7 +527,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) pivoting. * * SSPTRF diff --git a/lapack-netlib/TESTING/LIN/serrtr.f b/lapack-netlib/TESTING/LIN/serrtr.f index af1ce0a8e..391b54c3f 100644 --- a/lapack-netlib/TESTING/LIN/serrtr.f +++ b/lapack-netlib/TESTING/LIN/serrtr.f @@ -71,7 +71,7 @@ * .. Local Scalars .. CHARACTER*2 C2 INTEGER INFO - REAL RCOND, SCALE + REAL RCOND, SCALE, SCALES(0) * .. * .. Local Arrays .. INTEGER IW( NMAX ) @@ -250,40 +250,40 @@ * SRNAMT = 'SLATRS3' INFOT = 1 - CALL SLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL SLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL SLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL SLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 5 - CALL SLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 6 - CALL SLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 8 - CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 10 - CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 14 - CALL SLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 0, INFO ) + CALL SLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 0, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN diff --git a/lapack-netlib/TESTING/LIN/serrvx.f b/lapack-netlib/TESTING/LIN/serrvx.f index c3db47332..440f9113e 100644 --- a/lapack-netlib/TESTING/LIN/serrvx.f +++ b/lapack-netlib/TESTING/LIN/serrvx.f @@ -582,6 +582,9 @@ INFOT = 3 CALL SSYSV( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'SSYSV ', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SSYSV( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'SSYSV ', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL SSYSV( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'SSYSV ', INFOT, NOUT, LERR, OK ) @@ -647,6 +650,9 @@ INFOT = 3 CALL SSYSV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'SSYSV_ROOK', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SSYSV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'SSYSV_ROOK', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL SSYSV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'SSYSV_ROOK', INFOT, NOUT, LERR, OK ) @@ -694,7 +700,7 @@ * ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN * -* SSYSV_AA +* SSYSV_AASEN * SRNAMT = 'SSYSV_AA' INFOT = 1 @@ -706,13 +712,19 @@ INFOT = 3 CALL SSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'SSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SSYSV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'SSYSV_AA', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL SSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'SSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SSYSV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO ) + CALL CHKXER( 'SSYSV_AA', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN * -* DSYSV_AASEN_2STAGE +* SSYSV_AASEN_2STAGE * SRNAMT = 'SSYSV_AA_2STAGE' INFOT = 1 @@ -729,14 +741,18 @@ CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 5 CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) - INFOT = 7 - CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, + INFOT = 13 + CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) * diff --git a/lapack-netlib/TESTING/LIN/sgtt01.f b/lapack-netlib/TESTING/LIN/sgtt01.f index 5d88c91f7..5d639af99 100644 --- a/lapack-netlib/TESTING/LIN/sgtt01.f +++ b/lapack-netlib/TESTING/LIN/sgtt01.f @@ -39,7 +39,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/sgtt02.f b/lapack-netlib/TESTING/LIN/sgtt02.f index a4eb09f6b..6daea295f 100644 --- a/lapack-netlib/TESTING/LIN/sgtt02.f +++ b/lapack-netlib/TESTING/LIN/sgtt02.f @@ -41,14 +41,14 @@ *> \verbatim *> TRANS is CHARACTER *> Specifies the form of the residual. -*> = 'N': B - A * X (No transpose) +*> = 'N': B - A * X (No transpose) *> = 'T': B - A**T * X (Transpose) *> = 'C': B - A**H * X (Conjugate transpose = Transpose) *> \endverbatim *> *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/slatb4.f b/lapack-netlib/TESTING/LIN/slatb4.f index 94d29db40..72a310727 100644 --- a/lapack-netlib/TESTING/LIN/slatb4.f +++ b/lapack-netlib/TESTING/LIN/slatb4.f @@ -153,9 +153,6 @@ * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT * .. -* .. External Subroutines .. - EXTERNAL SLABAD -* .. * .. Save statement .. SAVE EPS, SMALL, LARGE, BADC1, BADC2, FIRST * .. @@ -173,11 +170,6 @@ BADC1 = SQRT( BADC2 ) SMALL = SLAMCH( 'Safe minimum' ) LARGE = ONE / SMALL -* -* If it looks like we're on a Cray, take the square root of -* SMALL and LARGE to avoid overflow and underflow problems. -* - CALL SLABAD( SMALL, LARGE ) SMALL = SHRINK*( SMALL / EPS ) LARGE = ONE / SMALL END IF @@ -232,6 +224,110 @@ ELSE ANORM = ONE END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* xQK: truncated QR with pivoting. +* Set parameters to generate a general +* M x N matrix. +* +* Set TYPE, the type of matrix to be generated. 'N' is nonsymmetric. +* + TYPE = 'N' +* +* Set DIST, the type of distribution for the random +* number generator. 'S' is +* + DIST = 'S' +* +* Set the lower and upper bandwidths. +* + IF( IMAT.EQ.2 ) THEN +* +* 2. Random, Diagonal, CNDNUM = 2 +* + KL = 0 + KU = 0 + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE IF( IMAT.EQ.3 ) THEN +* +* 3. Random, Upper triangular, CNDNUM = 2 +* + KL = 0 + KU = MAX( N-1, 0 ) + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE IF( IMAT.EQ.4 ) THEN +* +* 4. Random, Lower triangular, CNDNUM = 2 +* + KL = MAX( M-1, 0 ) + KU = 0 + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE +* +* 5.-19. Rectangular matrix +* + KL = MAX( M-1, 0 ) + KU = MAX( N-1, 0 ) +* + IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN +* +* 5.-14. Random, CNDNUM = 2. +* + CNDNUM = TWO + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.15 ) THEN +* +* 15. Random, CNDNUM = sqrt(0.1/EPS) +* + CNDNUM = BADC1 + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.16 ) THEN +* +* 16. Random, CNDNUM = 0.1/EPS +* + CNDNUM = BADC2 + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.17 ) THEN +* +* 17. Random, CNDNUM = 0.1/EPS, +* one small singular value S(N)=1/CNDNUM +* + CNDNUM = BADC2 + ANORM = ONE + MODE = 2 +* + ELSE IF( IMAT.EQ.18 ) THEN +* +* 18. Random, scaled near underflow +* + CNDNUM = TWO + ANORM = SMALL + MODE = 3 +* + ELSE IF( IMAT.EQ.19 ) THEN +* +* 19. Random, scaled near overflow +* + CNDNUM = TWO + ANORM = LARGE + MODE = 3 +* + END IF +* + END IF * ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN * @@ -518,17 +614,18 @@ * * Set the norm and condition number. * - IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN + MAT = ABS( IMAT ) + IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN CNDNUM = BADC1 - ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN + ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN CNDNUM = BADC2 ELSE CNDNUM = TWO END IF * - IF( IMAT.EQ.4 ) THEN + IF( MAT.EQ.4 ) THEN ANORM = SMALL - ELSE IF( IMAT.EQ.5 ) THEN + ELSE IF( MAT.EQ.5 ) THEN ANORM = LARGE ELSE ANORM = ONE diff --git a/lapack-netlib/TESTING/LIN/slqt02.f b/lapack-netlib/TESTING/LIN/slqt02.f index d59a8a01c..1c7bd4f0d 100644 --- a/lapack-netlib/TESTING/LIN/slqt02.f +++ b/lapack-netlib/TESTING/LIN/slqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> SLQT02 tests SORGLQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the LQ factorization of an m-by-n matrix A, SLQT02 generates diff --git a/lapack-netlib/TESTING/LIN/sptt01.f b/lapack-netlib/TESTING/LIN/sptt01.f index fef18dd2b..5b33bfad5 100644 --- a/lapack-netlib/TESTING/LIN/sptt01.f +++ b/lapack-netlib/TESTING/LIN/sptt01.f @@ -35,7 +35,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/sptt02.f b/lapack-netlib/TESTING/LIN/sptt02.f index 51a6a074a..2bd5535aa 100644 --- a/lapack-netlib/TESTING/LIN/sptt02.f +++ b/lapack-netlib/TESTING/LIN/sptt02.f @@ -35,7 +35,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/sqlt02.f b/lapack-netlib/TESTING/LIN/sqlt02.f index 5d381b950..f26352eb3 100644 --- a/lapack-netlib/TESTING/LIN/sqlt02.f +++ b/lapack-netlib/TESTING/LIN/sqlt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> SQLT02 tests SORGQL, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QL factorization of an m-by-n matrix A, SQLT02 generates diff --git a/lapack-netlib/TESTING/LIN/sqpt01.f b/lapack-netlib/TESTING/LIN/sqpt01.f index de0c80e53..f53686a65 100644 --- a/lapack-netlib/TESTING/LIN/sqpt01.f +++ b/lapack-netlib/TESTING/LIN/sqpt01.f @@ -33,7 +33,8 @@ *> Householder vectors, and the rest of AF contains a partially updated *> matrix. *> -*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M) +*> This function returns ||A*P - Q*R|| / ( ||norm(A)||*eps*max(M,N) ) +*> where || . || is matrix one norm. *> \endverbatim * * Arguments: @@ -172,28 +173,28 @@ * NORMA = SLANGE( 'One-norm', M, N, A, LDA, RWORK ) * - DO 30 J = 1, K - DO 10 I = 1, MIN( J, M ) + DO J = 1, K + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = AF( I, J ) - 10 CONTINUE - DO 20 I = J + 1, M + END DO + DO I = J + 1, M WORK( ( J-1 )*M+I ) = ZERO - 20 CONTINUE - 30 CONTINUE - DO 40 J = K + 1, N + END DO + END DO + DO J = K + 1, N CALL SCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 ) - 40 CONTINUE + END DO * CALL SORMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK, $ M, WORK( M*N+1 ), LWORK-M*N, INFO ) * - DO 50 J = 1, N + DO J = 1, N * * Compare i-th column of QR and jpvt(i)-th column of A * CALL SAXPY( M, -ONE, A( 1, JPVT( J ) ), 1, WORK( ( J-1 )*M+1 ), $ 1 ) - 50 CONTINUE + END DO * SQPT01 = SLANGE( 'One-norm', M, N, WORK, M, RWORK ) / $ ( REAL( MAX( M, N ) )*SLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/sqrt02.f b/lapack-netlib/TESTING/LIN/sqrt02.f index 72163f0a9..44b9c6270 100644 --- a/lapack-netlib/TESTING/LIN/sqrt02.f +++ b/lapack-netlib/TESTING/LIN/sqrt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> SQRT02 tests SORGQR, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QR factorization of an m-by-n matrix A, SQRT02 generates diff --git a/lapack-netlib/TESTING/LIN/sqrt11.f b/lapack-netlib/TESTING/LIN/sqrt11.f index d4422dacb..a3753adcf 100644 --- a/lapack-netlib/TESTING/LIN/sqrt11.f +++ b/lapack-netlib/TESTING/LIN/sqrt11.f @@ -157,9 +157,9 @@ CALL SORM2R( 'Left', 'Transpose', M, M, K, A, LDA, TAU, WORK, M, $ WORK( M*M+1 ), INFO ) * - DO 10 J = 1, M + DO J = 1, M WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE - 10 CONTINUE + END DO * SQRT11 = SLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) / $ ( REAL( M )*SLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/sqrt12.f b/lapack-netlib/TESTING/LIN/sqrt12.f index 2eab0ee0d..46b359e07 100644 --- a/lapack-netlib/TESTING/LIN/sqrt12.f +++ b/lapack-netlib/TESTING/LIN/sqrt12.f @@ -26,7 +26,7 @@ *> SQRT12 computes the singular values `svlues' of the upper trapezoid *> of A(1:M,1:N) and returns the ratio *> -*> || s - svlues||/(||svlues||*eps*max(M,N)) +*> || svlues - s ||/(||s||*eps*max(M,N)) *> \endverbatim * * Arguments: @@ -113,8 +113,7 @@ EXTERNAL SASUM, SLAMCH, SLANGE, SNRM2 * .. * .. External Subroutines .. - EXTERNAL SAXPY, SBDSQR, SGEBD2, SLABAD, SLASCL, SLASET, - $ XERBLA + EXTERNAL SAXPY, SBDSQR, SGEBD2, SLASCL, SLASET, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, REAL @@ -145,17 +144,16 @@ * Copy upper triangle of A into work * CALL SLASET( 'Full', M, N, ZERO, ZERO, WORK, M ) - DO 20 J = 1, N - DO 10 I = 1, MIN( J, M ) + DO J = 1, N + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = A( I, J ) - 10 CONTINUE - 20 CONTINUE + END DO + END DO * * Get machine parameters * SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale work if max entry outside range [SMLNUM,BIGNUM] * @@ -199,9 +197,9 @@ * ELSE * - DO 30 I = 1, MN + DO I = 1, MN WORK( M*N+I ) = ZERO - 30 CONTINUE + END DO END IF * * Compare s and singular values of work diff --git a/lapack-netlib/TESTING/LIN/srqt02.f b/lapack-netlib/TESTING/LIN/srqt02.f index ca0594f7a..a33c98ba4 100644 --- a/lapack-netlib/TESTING/LIN/srqt02.f +++ b/lapack-netlib/TESTING/LIN/srqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> SRQT02 tests SORGRQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the RQ factorization of an m-by-n matrix A, SRQT02 generates diff --git a/lapack-netlib/TESTING/LIN/srzt01.f b/lapack-netlib/TESTING/LIN/srzt01.f index 4478c0506..a2b8b29ba 100644 --- a/lapack-netlib/TESTING/LIN/srzt01.f +++ b/lapack-netlib/TESTING/LIN/srzt01.f @@ -158,7 +158,7 @@ * * R = R * P(1) * ... *P(m) * - CALL SORMRZ( 'Right', 'No tranpose', M, N, M, N-M, AF, LDA, TAU, + CALL SORMRZ( 'Right', 'No transpose', M, N, M, N-M, AF, LDA, TAU, $ WORK, M, WORK( M*N+1 ), LWORK-M*N, INFO ) * * R = R - A diff --git a/lapack-netlib/TESTING/LIN/ssyt01_3.f b/lapack-netlib/TESTING/LIN/ssyt01_3.f index 147553db9..951fcb7d6 100644 --- a/lapack-netlib/TESTING/LIN/ssyt01_3.f +++ b/lapack-netlib/TESTING/LIN/ssyt01_3.f @@ -183,7 +183,7 @@ RETURN END IF * -* a) Revert to multiplyers of L +* a) Revert to multipliers of L * CALL SSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/zchkaa.F b/lapack-netlib/TESTING/LIN/zchkaa.F index a118515a5..f1020f2d8 100644 --- a/lapack-netlib/TESTING/LIN/zchkaa.F +++ b/lapack-netlib/TESTING/LIN/zchkaa.F @@ -69,6 +69,7 @@ *> ZLQ 8 List types on next line if 0 < NTYPES < 8 *> ZQL 8 List types on next line if 0 < NTYPES < 8 *> ZQP 6 List types on next line if 0 < NTYPES < 6 +*> ZQK 19 List types on next line if 0 < NTYPES < 19 *> ZTZ 3 List types on next line if 0 < NTYPES < 3 *> ZLS 6 List types on next line if 0 < NTYPES < 6 *> ZEQ @@ -153,12 +154,11 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - DOUBLE PRECISION S( 2*NMAX ) - COMPLEX*16 E( NMAX ) -* -* .. Allocatable Arrays .. +* .. +* .. Allocatable Arrays .. INTEGER AllocateStatus - DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE:: RWORK + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE:: RWORK, S + COMPLEX*16, DIMENSION(:), ALLOCATABLE :: E COMPLEX*16, DIMENSION(:,:), ALLOCATABLE:: A, B, WORK * .. * .. External Functions .. @@ -170,15 +170,16 @@ EXTERNAL ALAREQ, ZCHKEQ, ZCHKGB, ZCHKGE, ZCHKGT, ZCHKHE, $ ZCHKHE_ROOK, ZCHKHE_RK, ZCHKHE_AA, ZCHKHP, $ ZCHKLQ, ZCHKUNHR_COL, ZCHKPB, ZCHKPO, ZCHKPS, - $ ZCHKPP, ZCHKPT, ZCHKQ3, ZCHKQL, ZCHKQR, ZCHKRQ, - $ ZCHKSP, ZCHKSY, ZCHKSY_ROOK, ZCHKSY_RK, - $ ZCHKSY_AA, ZCHKTB, ZCHKTP, ZCHKTR, ZCHKTZ, - $ ZDRVGB, ZDRVGE, ZDRVGT, ZDRVHE, ZDRVHE_ROOK, - $ ZDRVHE_RK, ZDRVHE_AA, ZDRVHE_AA_2STAGE, ZDRVHP, - $ ZDRVLS, ZDRVPB, ZDRVPO, ZDRVPP, ZDRVPT, - $ ZDRVSP, ZDRVSY, ZDRVSY_ROOK, ZDRVSY_RK, - $ ZDRVSY_AA, ZDRVSY_AA_2STAGE, ILAVER, ZCHKQRT, - $ ZCHKQRTP, ZCHKLQT, ZCHKLQTP, ZCHKTSQR + $ ZCHKPP, ZCHKPT, ZCHKQ3, ZCHKQP3RK, ZCHKQL, + $ ZCHKQR, ZCHKRQ, ZCHKSP, ZCHKSY, ZCHKSY_ROOK, + $ ZCHKSY_RK, ZCHKSY_AA, ZCHKTB, ZCHKTP, ZCHKTR, + $ ZCHKTZ, ZDRVGB, ZDRVGE, ZDRVGT, ZDRVHE, + $ ZDRVHE_ROOK, ZDRVHE_RK, ZDRVHE_AA, + $ ZDRVHE_AA_2STAGE, ZDRVHP, ZDRVLS, ZDRVPB, + $ ZDRVPO, ZDRVPP, ZDRVPT, ZDRVSP, ZDRVSY, + $ ZDRVSY_ROOK, ZDRVSY_RK, ZDRVSY_AA, + $ ZDRVSY_AA_2STAGE, ILAVER, ZCHKQRT, ZCHKQRTP, + $ ZCHKLQT, ZCHKLQTP, ZCHKTSQR * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -197,13 +198,18 @@ DATA THREQ / 2.0D0 / , INTSTR / '0123456789' / * * .. Allocate memory dynamically .. - ALLOCATE (RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus) +* + ALLOCATE ( A ( (KDMAX+1) * NMAX, 7 ), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B ( NMAX * MAXRHS, 4 ), STAT = AllocateStatus) + IF (AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( WORK ( NMAX, NMAX+MAXRHS+10 ), STAT = AllocateStatus) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (A ((KDMAX+1) * NMAX, 7), STAT = AllocateStatus) + ALLOCATE ( E( NMAX ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (B (NMAX * MAXRHS, 4), STAT = AllocateStatus) + ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (WORK (NMAX, NMAX+MAXRHS+10), STAT = AllocateStatus) + ALLOCATE ( RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. * .. Executable Statements .. @@ -1109,6 +1115,23 @@ ELSE WRITE( NOUT, FMT = 9989 )PATH END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* QK: truncated QR factorization with pivoting +* + NTYPES = 19 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ), + $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ S( 1 ), B( 1, 4 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF * ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN * @@ -1245,6 +1268,8 @@ * DEALLOCATE (A, STAT = AllocateStatus) DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (E, STAT = AllocateStatus) + DEALLOCATE (S, STAT = AllocateStatus) DEALLOCATE (RWORK, STAT = AllocateStatus) DEALLOCATE (WORK, STAT = AllocateStatus) * diff --git a/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f index 381fac9f2..51082f1d0 100644 --- a/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f @@ -8,7 +8,7 @@ * Definition: * =========== * -* SUBROUTINE ZCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB, NBVAL, +* SUBROUTINE ZCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB, NBVAL, * NNS, NSVAL, THRESH, TSTERR, NMAX, A, * AFAC, AINV, B, X, XACT, WORK, RWORK, * IWORK, NOUT ) @@ -185,7 +185,8 @@ LOGICAL DOTYPE( * ) INTEGER IWORK( * ), NBVAL( * ), NSVAL( * ), NVAL( * ) COMPLEX*16 A( * ), AFAC( * ), AINV( * ), B( * ), - $ RWORK( * ), WORK( * ), X( * ), XACT( * ) + $ WORK( * ), X( * ), XACT( * ) + DOUBLE PRECISION RWORK( * ) * .. * * ===================================================================== @@ -430,9 +431,9 @@ * block factorization, LWORK is the length of AINV. * SRNAMT = 'ZHETRF_AA_2STAGE' - LWORK = MIN(N*NB, 3*NMAX*NMAX) - CALL ZHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, - $ AINV, (3*NB+1)*N, + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) + CALL ZHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ WORK, LWORK, $ INFO ) diff --git a/lapack-netlib/TESTING/LIN/zchklq.f b/lapack-netlib/TESTING/LIN/zchklq.f index 371bb946b..ccef7b803 100644 --- a/lapack-netlib/TESTING/LIN/zchklq.f +++ b/lapack-netlib/TESTING/LIN/zchklq.f @@ -235,7 +235,7 @@ DOUBLE PRECISION RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, XLAENV, ZERRLQ, ZGELQS, + EXTERNAL ALAERH, ALAHD, ALASUM, XLAENV, ZERRLQ, ZGELS, $ ZGET02, ZLACPY, ZLARHS, ZLATB4, ZLATMS, ZLQT01, $ ZLQT02, ZLQT03 * .. @@ -370,7 +370,7 @@ $ WORK, LWORK, RWORK, RESULT( 3 ) ) NT = NT + 4 * -* If M>=N and K=N, call ZGELQS to solve a system +* If M<=N and K=M, call ZGELS to solve a system * with NRHS right hand sides and compute the * residual. * @@ -387,14 +387,20 @@ * CALL ZLACPY( 'Full', M, NRHS, B, LDA, X, $ LDA ) - SRNAMT = 'ZGELQS' - CALL ZGELQS( M, N, NRHS, AF, LDA, TAU, X, - $ LDA, WORK, LWORK, INFO ) * -* Check error code from ZGELQS. +* Reset AF to the original matrix. ZGELS +* factors the matrix before solving the system. +* + CALL ZLACPY( 'Full', M, N, A, LDA, AF, LDA ) +* + SRNAMT = 'ZGELS' + CALL ZGELS( 'No transpose', M, N, NRHS, AF, + $ LDA, X, LDA, WORK, LWORK, INFO ) +* +* Check error code from ZGELS. * IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'ZGELQS', INFO, 0, ' ', + $ CALL ALAERH( PATH, 'ZGELS', INFO, 0, 'N', $ M, N, NRHS, -1, NB, IMAT, $ NFAIL, NERRS, NOUT ) * diff --git a/lapack-netlib/TESTING/LIN/zchkqp3rk.f b/lapack-netlib/TESTING/LIN/zchkqp3rk.f new file mode 100644 index 000000000..302c7b1a8 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/zchkqp3rk.f @@ -0,0 +1,836 @@ +*> \brief \b ZCHKQP3RK +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE ZCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, +* $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, +* $ B, COPYB, S, TAU, +* $ WORK, RWORK, IWORK, NOUT ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER NM, NN, NNB, NOUT +* DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. +* LOGICAL DOTYPE( * ) +* INTEGER IWORK( * ), MVAL( * ), NBVAL( * ), NVAL( * ), +* $ NXVAL( * ) +* DOUBLE PRECISION S( * ), RWORK( * ) +* COMPLEX*16 A( * ), COPYA( * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZCHKQP3RK tests ZGEQP3RK. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] DOTYPE +*> \verbatim +*> DOTYPE is LOGICAL array, dimension (NTYPES) +*> The matrix types to be used for testing. Matrices of type j +*> (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) = +*> .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used. +*> \endverbatim +*> +*> \param[in] NM +*> \verbatim +*> NM is INTEGER +*> The number of values of M contained in the vector MVAL. +*> \endverbatim +*> +*> \param[in] MVAL +*> \verbatim +*> MVAL is INTEGER array, dimension (NM) +*> The values of the matrix row dimension M. +*> \endverbatim +*> +*> \param[in] NN +*> \verbatim +*> NN is INTEGER +*> The number of values of N contained in the vector NVAL. +*> \endverbatim +*> +*> \param[in] NVAL +*> \verbatim +*> NVAL is INTEGER array, dimension (NN) +*> The values of the matrix column dimension N. +*> \endverbatim +*> +*> \param[in] NNS +*> \verbatim +*> NNS is INTEGER +*> The number of values of NRHS contained in the vector NSVAL. +*> \endverbatim +*> +*> \param[in] NSVAL +*> \verbatim +*> NSVAL is INTEGER array, dimension (NNS) +*> The values of the number of right hand sides NRHS. +*> \endverbatim +*> \param[in] NNB +*> \verbatim +*> NNB is INTEGER +*> The number of values of NB and NX contained in the +*> vectors NBVAL and NXVAL. The blocking parameters are used +*> in pairs (NB,NX). +*> \endverbatim +*> +*> \param[in] NBVAL +*> \verbatim +*> NBVAL is INTEGER array, dimension (NNB) +*> The values of the blocksize NB. +*> \endverbatim +*> +*> \param[in] NXVAL +*> \verbatim +*> NXVAL is INTEGER array, dimension (NNB) +*> The values of the crossover point NX. +*> \endverbatim +*> +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> The threshold value for the test ratios. A result is +*> included in the output file if RESULT >= THRESH. To have +*> every test ratio printed, use THRESH = 0. +*> \endverbatim +*> +*> \param[out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (MMAX*NMAX) +*> where MMAX is the maximum value of M in MVAL and NMAX is the +*> maximum value of N in NVAL. +*> \endverbatim +*> +*> \param[out] COPYA +*> \verbatim +*> COPYA is COMPLEX*16 array, dimension (MMAX*NMAX) +*> \endverbatim +*> +*> \param[out] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (MMAX*NSMAX) +*> where MMAX is the maximum value of M in MVAL and NSMAX is the +*> maximum value of NRHS in NSVAL. +*> \endverbatim +*> +*> \param[out] COPYB +*> \verbatim +*> COPYB is COMPLEX*16 array, dimension (MMAX*NSMAX) +*> \endverbatim +*> +*> \param[out] S +*> \verbatim +*> S is DOUBLE PRECISION array, dimension +*> (min(MMAX,NMAX)) +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX*16 array, dimension (MMAX) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension +*> (max(M*max(M,N) + 4*min(M,N) + max(M,N))) +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, dimension (4*NMAX) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (2*NMAX) +*> \endverbatim +*> +*> \param[in] NOUT +*> \verbatim +*> NOUT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16_lin +* +* ===================================================================== + SUBROUTINE ZCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, + $ B, COPYB, S, TAU, + $ WORK, RWORK, IWORK, NOUT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER NM, NN, NNB, NNS, NOUT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. + LOGICAL DOTYPE( * ) + INTEGER IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ), + $ NSVAL( * ), NXVAL( * ) + DOUBLE PRECISION S( * ), RWORK( * ) + COMPLEX*16 A( * ), COPYA( * ), B( * ), COPYB( * ), + $ TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NTYPES + PARAMETER ( NTYPES = 19 ) + INTEGER NTESTS + PARAMETER ( NTESTS = 5 ) + DOUBLE PRECISION ONE, ZERO, BIGNUM + COMPLEX*16 CONE, CZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0, + $ CZERO = ( 0.0D+0, 0.0D+0 ), + $ CONE = ( 1.0D+0, 0.0D+0 ), + $ BIGNUM = 1.0D+38 ) +* .. +* .. Local Scalars .. + CHARACTER DIST, TYPE + CHARACTER*3 PATH + INTEGER I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO, + $ INB, IND_OFFSET_GEN, + $ IND_IN, IND_OUT, INS, INFO, + $ ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO, + $ KFACT, KL, KMAX, KU, LDA, LW, LWORK, + $ LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N, + $ NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS, + $ NRUN, NX, T + DOUBLE PRECISION ANORM, CNDNUM, EPS, ABSTOL, RELTOL, + $ DTEMP, MAXC2NRMK, RELMAXC2NRMK +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ), ISEEDY( 4 ) + DOUBLE PRECISION RESULT( NTESTS ), RDUMMY( 1 ) +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, ZQPT01, ZQRT11, ZQRT12, ZLANGE + EXTERNAL DLAMCH, ZQPT01, ZQRT11, ZQRT12, ZLANGE +* .. +* .. External Subroutines .. + EXTERNAL ALAERH, ALAHD, ALASUM, DLAORD, ICOPY, ZAXPY, + $ XLAENV, ZGEQP3RK, ZLACPY, ZLASET, ZLATB4, + $ ZLATMS, ZUNMQR, ZSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, MAX, MIN, MOD +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, IOUNIT, ZUNMQR_LWORK +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, IOUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEEDY / 1988, 1989, 1990, 1991 / +* .. +* .. Executable Statements .. +* +* Initialize constants and the random number seed. +* + PATH( 1: 1 ) = 'Zomplex precision' + PATH( 2: 3 ) = 'QK' + NRUN = 0 + NFAIL = 0 + NERRS = 0 + DO I = 1, 4 + ISEED( I ) = ISEEDY( I ) + END DO + EPS = DLAMCH( 'Epsilon' ) + INFOT = 0 +* + DO IM = 1, NM +* +* Do for each value of M in MVAL. +* + M = MVAL( IM ) + LDA = MAX( 1, M ) +* + DO IN = 1, NN +* +* Do for each value of N in NVAL. +* + N = NVAL( IN ) + MINMN = MIN( M, N ) + LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ), + $ M*N + 2*MINMN + 4*N ) +* + DO INS = 1, NNS + NRHS = NSVAL( INS ) +* +* Set up parameters with ZLATB4 and generate +* M-by-NRHS B matrix with ZLATMS. +* IMAT = 14: +* Random matrix, CNDNUM = 2, NORM = ONE, +* MODE = 3 (geometric distribution of singular values). +* + CALL ZLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'ZLATMS' + CALL ZLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYB, LDA, WORK, INFO ) +* +* Check error code from ZLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'ZLATMS', INFO, 0, ' ', M, + $ NRHS, -1, -1, -1, 6, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + DO IMAT = 1, NTYPES +* +* Do the tests only if DOTYPE( IMAT ) is true. +* + IF( .NOT.DOTYPE( IMAT ) ) + $ CYCLE +* +* The type of distribution used to generate the random +* eigen-/singular values: +* ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 ) +* +* Do for each type of NON-SYMMETRIC matrix: CNDNUM NORM MODE +* 1. Zero matrix +* 2. Random, Diagonal, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 3. Random, Upper triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 4. Random, Lower triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 5. Random, First column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 6. Random, Last MINMN column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 7. Random, Last N column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 8. Random, Middle column in MINMN is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 9. Random, First half of MINMN columns are zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 11. Random, Half MINMN columns in the middle are zero starting +* from MINMN/2-(MINMN/2)/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 12. Random, Odd columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 13. Random, Even columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 14. Random, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 15. Random, CNDNUM = sqrt(0.1/EPS) CNDNUM = BADC1 = sqrt(0.1/EPS) ONE 3 ( geometric distribution of singular values ) +* 16. Random, CNDNUM = 0.1/EPS CNDNUM = BADC2 = 0.1/EPS ONE 3 ( geometric distribution of singular values ) +* 17. Random, CNDNUM = 0.1/EPS, CNDNUM = BADC2 = 0.1/EPS ONE 2 ( one small singular value, S(N)=1/CNDNUM ) +* one small singular value S(N)=1/CNDNUM +* 18. Random, CNDNUM = 2, scaled near underflow CNDNUM = 2 SMALL = SAFMIN +* 19. Random, CNDNUM = 2, scaled near overflow CNDNUM = 2 LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) ) 3 ( geometric distribution of singular values ) +* + IF( IMAT.EQ.1 ) THEN +* +* Matrix 1: Zero matrix +* + CALL ZLASET( 'Full', M, N, CZERO, CZERO, COPYA, LDA ) + DO I = 1, MINMN + S( I ) = ZERO + END DO +* + ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 ) + $ .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN +* +* Matrices 2-5. +* +* Set up parameters with DLATB4 and generate a test +* matrix with ZLATMS. +* + CALL ZLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'ZLATMS' + CALL ZLATMS( M, N, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYA, LDA, WORK, INFO ) +* +* Check error code from ZLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'ZLATMS', INFO, 0, ' ', M, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + CALL DLAORD( 'Decreasing', MINMN, S, 1 ) +* + ELSE IF( MINMN.GE.2 + $ .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN +* +* Rectangular matrices 5-13 that contain zero columns, +* only for matrices MINMN >=2. +* +* JB_ZERO is the column index of ZERO block. +* NB_ZERO is the column block size of ZERO block. +* NB_GEN is the column blcok size of the +* generated block. +* J_INC in the non_zero column index increment +* for matrix 12 and 13. +* J_FIRS_NZ is the index of the first non-zero +* column. +* + IF( IMAT.EQ.5 ) THEN +* +* First column is zero. +* + JB_ZERO = 1 + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.6 ) THEN +* +* Last column MINMN is zero. +* + JB_ZERO = MINMN + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.7 ) THEN +* +* Last column N is zero. +* + JB_ZERO = N + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.8 ) THEN +* +* Middle column in MINMN is zero. +* + JB_ZERO = MINMN / 2 + 1 + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.9 ) THEN +* +* First half of MINMN columns is zero. +* + JB_ZERO = 1 + NB_ZERO = MINMN / 2 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.10 ) THEN +* +* Last columns are zero columns, +* starting from (MINMN / 2 + 1) column. +* + JB_ZERO = MINMN / 2 + 1 + NB_ZERO = N - JB_ZERO + 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.11 ) THEN +* +* Half of the columns in the middle of MINMN +* columns is zero, starting from +* MINMN/2 - (MINMN/2)/2 + 1 column. +* + JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1 + NB_ZERO = MINMN / 2 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.12 ) THEN +* +* Odd-numbered columns are zero, +* + NB_GEN = N / 2 + NB_ZERO = N - NB_GEN + J_INC = 2 + J_FIRST_NZ = 2 +* + ELSE IF( IMAT.EQ.13 ) THEN +* +* Even-numbered columns are zero. +* + NB_ZERO = N / 2 + NB_GEN = N - NB_ZERO + J_INC = 2 + J_FIRST_NZ = 1 +* + END IF +* +* +* 1) Set the first NB_ZERO columns in COPYA(1:M,1:N) +* to zero. +* + CALL ZLASET( 'Full', M, NB_ZERO, CZERO, CZERO, + $ COPYA, LDA ) +* +* 2) Generate an M-by-(N-NB_ZERO) matrix with the +* chosen singular value distribution +* in COPYA(1:M,NB_ZERO+1:N). +* + CALL ZLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU, + $ ANORM, MODE, CNDNUM, DIST ) +* + SRNAMT = 'ZLATMS' +* + IND_OFFSET_GEN = NB_ZERO * LDA +* + CALL ZLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYA( IND_OFFSET_GEN + 1 ), LDA, + $ WORK, INFO ) +* +* Check error code from ZLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'ZLATMS', INFO, 0, ' ', M, + $ NB_GEN, -1, -1, -1, IMAT, NFAIL, + $ NERRS, NOUT ) + CYCLE + END IF +* +* 3) Swap the gererated colums from the right side +* NB_GEN-size block in COPYA into correct column +* positions. +* + IF( IMAT.EQ.6 + $ .OR. IMAT.EQ.7 + $ .OR. IMAT.EQ.8 + $ .OR. IMAT.EQ.10 + $ .OR. IMAT.EQ.11 ) THEN +* +* Move by swapping the generated columns +* from the right NB_GEN-size block from +* (NB_ZERO+1:NB_ZERO+JB_ZERO) +* into columns (1:JB_ZERO-1). +* + DO J = 1, JB_ZERO-1, 1 + CALL ZSWAP( M, + $ COPYA( ( NB_ZERO+J-1)*LDA+1), 1, + $ COPYA( (J-1)*LDA + 1 ), 1 ) + END DO +* + ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN +* +* ( IMAT = 12, Odd-numbered ZERO columns. ) +* Swap the generated columns from the right +* NB_GEN-size block into the even zero colums in the +* left NB_ZERO-size block. +* +* ( IMAT = 13, Even-numbered ZERO columns. ) +* Swap the generated columns from the right +* NB_GEN-size block into the odd zero colums in the +* left NB_ZERO-size block. +* + DO J = 1, NB_GEN, 1 + IND_OUT = ( NB_ZERO+J-1 )*LDA + 1 + IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA + $ + 1 + CALL ZSWAP( M, + $ COPYA( IND_OUT ), 1, + $ COPYA( IND_IN), 1 ) + END DO +* + END IF +* +* 5) Order the singular values generated by +* DLAMTS in decreasing order and add trailing zeros +* that correspond to zero columns. +* The total number of singular values is MINMN. +* + MINMNB_GEN = MIN( M, NB_GEN ) +* + CALL DLAORD( 'Decreasing', MINMNB_GEN, S, 1 ) + + DO I = MINMNB_GEN+1, MINMN + S( I ) = ZERO + END DO +* + ELSE +* +* IF(MINMN.LT.2) skip this size for this matrix type. +* + CYCLE + END IF +* +* Initialize a copy array for a pivot array for DGEQP3RK. +* + DO I = 1, N + IWORK( I ) = 0 + END DO +* + DO INB = 1, NNB +* +* Do for each pair of values (NB,NX) in NBVAL and NXVAL. +* + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) + NX = NXVAL( INB ) + CALL XLAENV( 3, NX ) +* +* We do MIN(M,N)+1 because we need a test for KMAX > N, +* when KMAX is larger than MIN(M,N), KMAX should be +* KMAX = MIN(M,N) +* + DO KMAX = 0, MIN(M,N)+1 +* +* Get a working copy of COPYA into A( 1:M,1:N ). +* Get a working copy of COPYB into A( 1:M, (N+1):NRHS ). +* Get a working copy of COPYB into into B( 1:M, 1:NRHS ). +* Get a working copy of IWORK(1:N) awith zeroes into +* which is going to be used as pivot array IWORK( N+1:2N ). +* NOTE: IWORK(2N+1:3N) is going to be used as a WORK array +* for the routine. +* + CALL ZLACPY( 'All', M, N, COPYA, LDA, A, LDA ) + CALL ZLACPY( 'All', M, NRHS, COPYB, LDA, + $ A( LDA*N + 1 ), LDA ) + CALL ZLACPY( 'All', M, NRHS, COPYB, LDA, + $ B, LDA ) + CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 ) +* + ABSTOL = -1.0 + RELTOl = -1.0 +* +* Compute the QR factorization with pivoting of A +* + LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ), + $ 3*N + NRHS - 1 ) ) +* +* Compute ZGEQP3RK factorization of A. +* + SRNAMT = 'ZGEQP3RK' + CALL ZGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, + $ A, LDA, KFACT, MAXC2NRMK, + $ RELMAXC2NRMK, IWORK( N+1 ), TAU, + $ WORK, LW, RWORK, IWORK( 2*N+1 ), + $ INFO ) +* +* Check error code from ZGEQP3RK. +* + IF( INFO.LT.0 ) + $ CALL ALAERH( PATH, 'ZGEQP3RK', INFO, 0, ' ', + $ M, N, NX, -1, NB, IMAT, + $ NFAIL, NERRS, NOUT ) +* + IF( KFACT.EQ.MINMN ) THEN +* +* Compute test 1: +* +* This test in only for the full rank factorization of +* the matrix A. +* +* Array S(1:min(M,N)) contains svd(A) the sigular values +* of the original matrix A in decreasing absolute value +* order. The test computes svd(R), the vector sigular +* values of the upper trapezoid of A(1:M,1:N) that +* contains the factor R, in decreasing order. The test +* returns the ratio: +* +* 2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS ) +* + RESULT( 1 ) = ZQRT12( M, N, A, LDA, S, WORK, + $ LWORK , RWORK ) +* + DO T = 1, 1 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, NB, NX, + $ IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End test 1 +* + END IF + +* Compute test 2: +* +* The test returns the ratio: +* +* 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS ) +* + RESULT( 2 ) = ZQPT01( M, N, KFACT, COPYA, A, LDA, TAU, + $ IWORK( N+1 ), WORK, LWORK ) +* +* Compute test 3: +* +* The test returns the ratio: +* +* 1-norm( Q**T * Q - I ) / ( M * EPS ) +* + RESULT( 3 ) = ZQRT11( M, KFACT, A, LDA, TAU, WORK, + $ LWORK ) +* +* Print information about the tests that did not pass +* the threshold. +* + DO T = 2, 3 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 +* +* Compute test 4: +* +* This test is only for the factorizations with the +* rank greater than 2. +* The elements on the diagonal of R should be non- +* increasing. +* +* The test returns the ratio: +* +* Returns 1.0D+100 if abs(R(K+1,K+1)) > abs(R(K,K)), +* K=1:KFACT-1 +* + IF( MIN(KFACT, MINMN).GE.2 ) THEN +* + DO J = 1, KFACT-1, 1 +* + DTEMP = (( ABS( A( (J-1)*M+J ) ) - + $ ABS( A( (J)*M+J+1 ) ) ) / + $ ABS( A(1) ) ) +* + IF( DTEMP.LT.ZERO ) THEN + RESULT( 4 ) = BIGNUM + END IF +* + END DO +* +* Print information about the tests that did not +* pass the threshold. +* + DO T = 4, 4 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK', + $ M, N, NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, + $ RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End test 4. +* + END IF +* +* Compute test 5: +* +* This test in only for matrix A with min(M,N) > 0. +* +* The test returns the ratio: +* +* 1-norm(Q**T * B - Q**T * B ) / +* ( M * EPS ) +* +* (1) Compute B:=Q**T * B in the matrix B. +* + IF( MINMN.GT.0 ) THEN +* + LWORK_MQR = MAX(1, NRHS) + CALL ZUNMQR( 'Left', 'Conjugate transpose', + $ M, NRHS, KFACT, A, LDA, TAU, B, LDA, + $ WORK, LWORK_MQR, INFO ) +* + DO I = 1, NRHS +* +* Compare N+J-th column of A and J-column of B. +* + CALL ZAXPY( M, -CONE, A( ( N+I-1 )*LDA+1 ), 1, + $ B( ( I-1 )*LDA+1 ), 1 ) + END DO +* + RESULT( 5 ) = + $ ABS( + $ ZLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY ) / + $ ( DBLE( M )*DLAMCH( 'Epsilon' ) ) + $ ) +* +* Print information about the tests that did not pass +* the threshold. +* + DO T = 5, 5 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End compute test 5. +* + END IF +* +* END DO KMAX = 1, MIN(M,N)+1 +* + END DO +* +* END DO for INB = 1, NNB +* + END DO +* +* END DO for IMAT = 1, NTYPES +* + END DO +* +* END DO for INS = 1, NNS +* + END DO +* +* END DO for IN = 1, NN +* + END DO +* +* END DO for IM = 1, NM +* + END DO +* +* Print a summary of the results. +* + CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) +* + 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5, + $ ', KMAX =', I5, ', ABSTOL =', G12.5, + $ ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4, + $ ', type ', I2, ', test ', I2, ', ratio =', G12.5 ) +* +* End of ZCHKQP3RK +* + END diff --git a/lapack-netlib/TESTING/LIN/zchkqr.f b/lapack-netlib/TESTING/LIN/zchkqr.f index a240d2da5..c088bacc9 100644 --- a/lapack-netlib/TESTING/LIN/zchkqr.f +++ b/lapack-netlib/TESTING/LIN/zchkqr.f @@ -244,7 +244,7 @@ EXTERNAL ZGENND * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, XLAENV, ZERRQR, ZGEQRS, + EXTERNAL ALAERH, ALAHD, ALASUM, XLAENV, ZERRQR, ZGELS, $ ZGET02, ZLACPY, ZLARHS, ZLATB4, ZLATMS, ZQRT01, $ ZQRT01P, ZQRT02, ZQRT03 * .. @@ -388,7 +388,7 @@ $ WORK, LWORK, RWORK, RESULT( 3 ) ) NT = NT + 4 * -* If M>=N and K=N, call ZGEQRS to solve a system +* If M>=N and K=N, call ZGELS to solve a system * with NRHS right hand sides and compute the * residual. * @@ -405,14 +405,20 @@ * CALL ZLACPY( 'Full', M, NRHS, B, LDA, X, $ LDA ) - SRNAMT = 'ZGEQRS' - CALL ZGEQRS( M, N, NRHS, AF, LDA, TAU, X, - $ LDA, WORK, LWORK, INFO ) * -* Check error code from ZGEQRS. +* Reset AF to the original matrix. ZGELS +* factors the matrix before solving the system. +* + CALL ZLACPY( 'Full', M, N, A, LDA, AF, LDA ) +* + SRNAMT = 'ZGELS' + CALL ZGELS( 'No transpose', M, N, NRHS, AF, + $ LDA, X, LDA, WORK, LWORK, INFO ) +* +* Check error code from ZGELS. * IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'ZGEQRS', INFO, 0, ' ', + $ CALL ALAERH( PATH, 'ZGELS', INFO, 0, 'N', $ M, N, NRHS, -1, NB, IMAT, $ NFAIL, NERRS, NOUT ) * diff --git a/lapack-netlib/TESTING/LIN/zchktp.f b/lapack-netlib/TESTING/LIN/zchktp.f index 1798c24e7..ab46f5ce5 100644 --- a/lapack-netlib/TESTING/LIN/zchktp.f +++ b/lapack-netlib/TESTING/LIN/zchktp.f @@ -87,7 +87,7 @@ *> \verbatim *> NMAX is INTEGER *> The leading dimension of the work arrays. NMAX >= the -*> maximumm value of N in NVAL. +*> maximum value of N in NVAL. *> \endverbatim *> *> \param[out] AP diff --git a/lapack-netlib/TESTING/LIN/zchktr.f b/lapack-netlib/TESTING/LIN/zchktr.f index 275ca2857..4af538124 100644 --- a/lapack-netlib/TESTING/LIN/zchktr.f +++ b/lapack-netlib/TESTING/LIN/zchktr.f @@ -201,7 +201,8 @@ * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - DOUBLE PRECISION RESULT( NTESTS ), SCALE3( 2 ) + DOUBLE PRECISION RESULT( NTESTS ), RWORK2( 2*NMAX), + $ SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -544,8 +545,8 @@ CALL ZCOPY( N, X, 1, B( N+1 ), 1 ) CALL ZDSCAL( N, BIGNUM, B( N+1 ), 1 ) CALL ZLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, - $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, - $ INFO ) + $ B, MAX(1, N), SCALE3, RWORK, RWORK2, + $ 2*NMAX, INFO ) * * Check error code from ZLATRS3. * diff --git a/lapack-netlib/TESTING/LIN/zdrvab.f b/lapack-netlib/TESTING/LIN/zdrvab.f index 130515959..772eb08af 100644 --- a/lapack-netlib/TESTING/LIN/zdrvab.f +++ b/lapack-netlib/TESTING/LIN/zdrvab.f @@ -348,7 +348,7 @@ CALL ZGET08( TRANS, N, N, NRHS, A, LDA, X, LDA, WORK, $ LDA, RWORK, RESULT( 1 ) ) * -* Check if the test passes the tesing. +* Check if the test passes the testing. * Print information about the tests that did not * pass the testing. * diff --git a/lapack-netlib/TESTING/LIN/zdrvac.f b/lapack-netlib/TESTING/LIN/zdrvac.f index 20f8eb1e5..bbf73a263 100644 --- a/lapack-netlib/TESTING/LIN/zdrvac.f +++ b/lapack-netlib/TESTING/LIN/zdrvac.f @@ -367,7 +367,7 @@ CALL ZPOT06( UPLO, N, NRHS, A, LDA, X, LDA, WORK, $ LDA, RWORK, RESULT( 1 ) ) * -* Check if the test passes the tesing. +* Check if the test passes the testing. * Print information about the tests that did not * pass the testing. * diff --git a/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f index 9401867e0..fcd774491 100644 --- a/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f @@ -400,9 +400,9 @@ * Factor the matrix and solve the system using ZHESV_AA. * SRNAMT = 'ZHESV_AA_2STAGE ' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) CALL ZHESV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ X, LDA, WORK, LWORK, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f index 21497477c..5406a76bf 100644 --- a/lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f @@ -229,7 +229,7 @@ * Test path * PATH( 1: 1 ) = 'Zomplex precision' - PATH( 2: 3 ) = 'H2' + PATH( 2: 3 ) = 'S2' * * Path to generate matrices * diff --git a/lapack-netlib/TESTING/LIN/zerrhe.f b/lapack-netlib/TESTING/LIN/zerrhe.f index e49e5037c..40dd25d75 100644 --- a/lapack-netlib/TESTING/LIN/zerrhe.f +++ b/lapack-netlib/TESTING/LIN/zerrhe.f @@ -135,7 +135,7 @@ IF( LSAMEN( 2, C2, 'HE' ) ) THEN * * Test error exits of the routines that use factorization -* of a Hermitian indefinite matrix with patrial +* of a Hermitian indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * ZHETRF @@ -580,7 +580,7 @@ ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN * * Test error exits of the routines that use factorization -* of a Hermitian indefinite packed matrix with patrial +* of a Hermitian indefinite packed matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * ZHPTRF diff --git a/lapack-netlib/TESTING/LIN/zerrhex.f b/lapack-netlib/TESTING/LIN/zerrhex.f index b6e8b77ef..bdcdefff7 100644 --- a/lapack-netlib/TESTING/LIN/zerrhex.f +++ b/lapack-netlib/TESTING/LIN/zerrhex.f @@ -138,7 +138,7 @@ OK = .TRUE. * * Test error exits of the routines that use factorization -* of a Hermitian indefinite matrix with patrial +* of a Hermitian indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * IF( LSAMEN( 2, C2, 'HE' ) ) THEN @@ -526,7 +526,7 @@ ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN * * Test error exits of the routines that use factorization -* of a Hermitian indefinite packed matrix with patrial +* of a Hermitian indefinite packed matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * ZHPTRF diff --git a/lapack-netlib/TESTING/LIN/zerrlq.f b/lapack-netlib/TESTING/LIN/zerrlq.f index d8e5a8fe8..d91b4e4b3 100644 --- a/lapack-netlib/TESTING/LIN/zerrlq.f +++ b/lapack-netlib/TESTING/LIN/zerrlq.f @@ -76,7 +76,7 @@ $ W( NMAX ), X( NMAX ) * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, ZGELQ2, ZGELQF, ZGELQS, ZUNGL2, + EXTERNAL ALAESM, CHKXER, ZGELQ2, ZGELQF, ZUNGL2, $ ZUNGLQ, ZUNML2, ZUNMLQ * .. * .. Scalars in Common .. @@ -142,31 +142,6 @@ CALL ZGELQ2( 2, 1, A, 1, B, W, INFO ) CALL CHKXER( 'ZGELQ2', INFOT, NOUT, LERR, OK ) * -* ZGELQS -* - SRNAMT = 'ZGELQS' - INFOT = 1 - CALL ZGELQS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL ZGELQS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL ZGELQS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL ZGELQS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 5 - CALL ZGELQS( 2, 2, 0, A, 1, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL ZGELQS( 1, 2, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 10 - CALL ZGELQS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK ) -* * ZUNGLQ * SRNAMT = 'ZUNGLQ' diff --git a/lapack-netlib/TESTING/LIN/zerrqr.f b/lapack-netlib/TESTING/LIN/zerrqr.f index 114453d4c..3542c7a04 100644 --- a/lapack-netlib/TESTING/LIN/zerrqr.f +++ b/lapack-netlib/TESTING/LIN/zerrqr.f @@ -77,7 +77,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAESM, CHKXER, ZGEQR2, ZGEQR2P, ZGEQRF, - $ ZGEQRFP, ZGEQRS, ZUNG2R, ZUNGQR, ZUNM2R, + $ ZGEQRFP, ZUNG2R, ZUNGQR, ZUNM2R, $ ZUNMQR * .. * .. Scalars in Common .. @@ -172,31 +172,6 @@ CALL ZGEQR2P( 2, 1, A, 1, B, W, INFO ) CALL CHKXER( 'ZGEQR2P', INFOT, NOUT, LERR, OK ) * -* ZGEQRS -* - SRNAMT = 'ZGEQRS' - INFOT = 1 - CALL ZGEQRS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL ZGEQRS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL ZGEQRS( 1, 2, 0, A, 2, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL ZGEQRS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 5 - CALL ZGEQRS( 2, 1, 0, A, 1, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL ZGEQRS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 10 - CALL ZGEQRS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK ) -* * ZUNGQR * SRNAMT = 'ZUNGQR' diff --git a/lapack-netlib/TESTING/LIN/zerrsy.f b/lapack-netlib/TESTING/LIN/zerrsy.f index a50f9d24e..932df1936 100644 --- a/lapack-netlib/TESTING/LIN/zerrsy.f +++ b/lapack-netlib/TESTING/LIN/zerrsy.f @@ -132,7 +132,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * ZSYTRF @@ -471,7 +471,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) pivoting. * * ZSPTRF diff --git a/lapack-netlib/TESTING/LIN/zerrsyx.f b/lapack-netlib/TESTING/LIN/zerrsyx.f index 23d2a5a2b..9d5e71288 100644 --- a/lapack-netlib/TESTING/LIN/zerrsyx.f +++ b/lapack-netlib/TESTING/LIN/zerrsyx.f @@ -139,7 +139,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * ZSYTRF @@ -525,7 +525,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) pivoting. * * ZSPTRF diff --git a/lapack-netlib/TESTING/LIN/zerrtr.f b/lapack-netlib/TESTING/LIN/zerrtr.f index 211b92154..640c39793 100644 --- a/lapack-netlib/TESTING/LIN/zerrtr.f +++ b/lapack-netlib/TESTING/LIN/zerrtr.f @@ -70,7 +70,7 @@ * .. Local Scalars .. CHARACTER*2 C2 INTEGER INFO - DOUBLE PRECISION RCOND, SCALE + DOUBLE PRECISION RCOND, SCALE, SCALES(0) * .. * .. Local Arrays .. DOUBLE PRECISION R1( NMAX ), R2( NMAX ), RW( NMAX ) @@ -245,40 +245,40 @@ * SRNAMT = 'ZLATRS3' INFOT = 1 - CALL ZLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL ZLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL ZLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL ZLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 5 - CALL ZLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 6 - CALL ZLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 8 - CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 10 - CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 14 - CALL ZLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 0, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 0, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) * * Test error exits for the packed triangular routines. diff --git a/lapack-netlib/TESTING/LIN/zerrvx.f b/lapack-netlib/TESTING/LIN/zerrvx.f index ea7823df3..80b3aaf4a 100644 --- a/lapack-netlib/TESTING/LIN/zerrvx.f +++ b/lapack-netlib/TESTING/LIN/zerrvx.f @@ -653,6 +653,9 @@ INFOT = 3 CALL ZHESV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'ZHESV_ROOK', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZHESV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'ZHESV_ROOK', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZHESV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'ZHESV_ROOK', INFOT, NOUT, LERR, OK ) @@ -700,21 +703,27 @@ * ELSE IF( LSAMEN( 2, C2, 'HA' ) ) THEN * -* ZHESV_AA -* - SRNAMT = 'ZHESV_AA' - INFOT = 1 - CALL ZHESV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL ZHESV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL ZHESV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL ZHESV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) +* ZHESV_AASEN +* + SRNAMT = 'ZHESV_AA' + INFOT = 1 + CALL ZHESV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHESV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHESV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZHESV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZHESV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZHESV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO ) + CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'H2' ) ) THEN * @@ -735,16 +744,44 @@ CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) - INFOT = 7 - CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, + INFOT = 13 + CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) +* + ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN +* +* ZSYSV_AASEN +* + SRNAMT = 'ZSYSV_AA' + INFOT = 1 + CALL ZSYSV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZSYSV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZSYSV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZSYSV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO ) + CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN * @@ -765,17 +802,21 @@ CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) - INFOT = 7 - CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, + INFOT = 13 + CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) -** +* ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN * * ZHPSV @@ -836,6 +877,9 @@ INFOT = 3 CALL ZSYSV( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'ZSYSV ', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZSYSV( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'ZSYSV ', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZSYSV( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'ZSYSV ', INFOT, NOUT, LERR, OK ) @@ -900,6 +944,9 @@ INFOT = 3 CALL ZSYSV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'ZSYSV_ROOK', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZSYSV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'ZSYSV_ROOK', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZSYSV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'ZSYSV_ROOK', INFOT, NOUT, LERR, OK ) diff --git a/lapack-netlib/TESTING/LIN/zgtt01.f b/lapack-netlib/TESTING/LIN/zgtt01.f index c63fdbc25..a8ac13d2d 100644 --- a/lapack-netlib/TESTING/LIN/zgtt01.f +++ b/lapack-netlib/TESTING/LIN/zgtt01.f @@ -39,7 +39,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/zgtt02.f b/lapack-netlib/TESTING/LIN/zgtt02.f index 7362967be..f86fe9244 100644 --- a/lapack-netlib/TESTING/LIN/zgtt02.f +++ b/lapack-netlib/TESTING/LIN/zgtt02.f @@ -40,14 +40,14 @@ *> \verbatim *> TRANS is CHARACTER *> Specifies the form of the residual. -*> = 'N': B - A * X (No transpose) +*> = 'N': B - A * X (No transpose) *> = 'T': B - A**T * X (Transpose) *> = 'C': B - A**H * X (Conjugate transpose) *> \endverbatim *> *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/zhet01_3.f b/lapack-netlib/TESTING/LIN/zhet01_3.f index 0a76404d6..5beed9042 100644 --- a/lapack-netlib/TESTING/LIN/zhet01_3.f +++ b/lapack-netlib/TESTING/LIN/zhet01_3.f @@ -188,7 +188,7 @@ RETURN END IF * -* a) Revert to multiplyers of L +* a) Revert to multipliers of L * CALL ZSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/zlatb4.f b/lapack-netlib/TESTING/LIN/zlatb4.f index a6977f4e9..a2b19f83d 100644 --- a/lapack-netlib/TESTING/LIN/zlatb4.f +++ b/lapack-netlib/TESTING/LIN/zlatb4.f @@ -154,9 +154,6 @@ * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT * .. -* .. External Subroutines .. - EXTERNAL DLABAD -* .. * .. Save statement .. SAVE EPS, SMALL, LARGE, BADC1, BADC2, FIRST * .. @@ -174,11 +171,6 @@ BADC1 = SQRT( BADC2 ) SMALL = DLAMCH( 'Safe minimum' ) LARGE = ONE / SMALL -* -* If it looks like we're on a Cray, take the square root of -* SMALL and LARGE to avoid overflow and underflow problems. -* - CALL DLABAD( SMALL, LARGE ) SMALL = SHRINK*( SMALL / EPS ) LARGE = ONE / SMALL END IF @@ -233,6 +225,110 @@ ELSE ANORM = ONE END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* xQK: truncated QR with pivoting. +* Set parameters to generate a general +* M x N matrix. +* +* Set TYPE, the type of matrix to be generated. 'N' is nonsymmetric. +* + TYPE = 'N' +* +* Set DIST, the type of distribution for the random +* number generator. 'S' is +* + DIST = 'S' +* +* Set the lower and upper bandwidths. +* + IF( IMAT.EQ.2 ) THEN +* +* 2. Random, Diagonal, CNDNUM = 2 +* + KL = 0 + KU = 0 + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE IF( IMAT.EQ.3 ) THEN +* +* 3. Random, Upper triangular, CNDNUM = 2 +* + KL = 0 + KU = MAX( N-1, 0 ) + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE IF( IMAT.EQ.4 ) THEN +* +* 4. Random, Lower triangular, CNDNUM = 2 +* + KL = MAX( M-1, 0 ) + KU = 0 + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE +* +* 5.-19. Rectangular matrix +* + KL = MAX( M-1, 0 ) + KU = MAX( N-1, 0 ) +* + IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN +* +* 5.-14. Random, CNDNUM = 2. +* + CNDNUM = TWO + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.15 ) THEN +* +* 15. Random, CNDNUM = sqrt(0.1/EPS) +* + CNDNUM = BADC1 + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.16 ) THEN +* +* 16. Random, CNDNUM = 0.1/EPS +* + CNDNUM = BADC2 + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.17 ) THEN +* +* 17. Random, CNDNUM = 0.1/EPS, +* one small singular value S(N)=1/CNDNUM +* + CNDNUM = BADC2 + ANORM = ONE + MODE = 2 +* + ELSE IF( IMAT.EQ.18 ) THEN +* +* 18. Random, scaled near underflow +* + CNDNUM = TWO + ANORM = SMALL + MODE = 3 +* + ELSE IF( IMAT.EQ.19 ) THEN +* +* 19. Random, scaled near overflow +* + CNDNUM = TWO + ANORM = LARGE + MODE = 3 +* + END IF +* + END IF * ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN * @@ -517,17 +613,18 @@ * * Set the norm and condition number. * - IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN + MAT = ABS( IMAT ) + IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN CNDNUM = BADC1 - ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN + ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN CNDNUM = BADC2 ELSE CNDNUM = TWO END IF * - IF( IMAT.EQ.4 ) THEN + IF( MAT.EQ.4 ) THEN ANORM = SMALL - ELSE IF( IMAT.EQ.5 ) THEN + ELSE IF( MAT.EQ.5 ) THEN ANORM = LARGE ELSE ANORM = ONE diff --git a/lapack-netlib/TESTING/LIN/zlqt02.f b/lapack-netlib/TESTING/LIN/zlqt02.f index c55d76ccd..9ba98ff1f 100644 --- a/lapack-netlib/TESTING/LIN/zlqt02.f +++ b/lapack-netlib/TESTING/LIN/zlqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> ZLQT02 tests ZUNGLQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the LQ factorization of an m-by-n matrix A, ZLQT02 generates diff --git a/lapack-netlib/TESTING/LIN/zptt01.f b/lapack-netlib/TESTING/LIN/zptt01.f index e842c7e46..9f2359c2a 100644 --- a/lapack-netlib/TESTING/LIN/zptt01.f +++ b/lapack-netlib/TESTING/LIN/zptt01.f @@ -36,7 +36,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/zptt02.f b/lapack-netlib/TESTING/LIN/zptt02.f index 6e3a8aed0..8a1e2961c 100644 --- a/lapack-netlib/TESTING/LIN/zptt02.f +++ b/lapack-netlib/TESTING/LIN/zptt02.f @@ -46,7 +46,7 @@ *> *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/zqlt02.f b/lapack-netlib/TESTING/LIN/zqlt02.f index 1f84cfa5a..7cb889931 100644 --- a/lapack-netlib/TESTING/LIN/zqlt02.f +++ b/lapack-netlib/TESTING/LIN/zqlt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> ZQLT02 tests ZUNGQL, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QL factorization of an m-by-n matrix A, ZQLT02 generates diff --git a/lapack-netlib/TESTING/LIN/zqpt01.f b/lapack-netlib/TESTING/LIN/zqpt01.f index 4e53f92c8..c69eb658f 100644 --- a/lapack-netlib/TESTING/LIN/zqpt01.f +++ b/lapack-netlib/TESTING/LIN/zqpt01.f @@ -33,7 +33,7 @@ *> Householder vectors, and the rest of AF contains a partially updated *> matrix. *> -*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M) +*> This function returns ||A*P - Q*R|| / ( ||norm(A)||*eps*max(M,N) ) *> \endverbatim * * Arguments: @@ -172,28 +172,28 @@ * NORMA = ZLANGE( 'One-norm', M, N, A, LDA, RWORK ) * - DO 30 J = 1, K - DO 10 I = 1, MIN( J, M ) + DO J = 1, K + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = AF( I, J ) - 10 CONTINUE - DO 20 I = J + 1, M + END DO + DO I = J + 1, M WORK( ( J-1 )*M+I ) = ZERO - 20 CONTINUE - 30 CONTINUE - DO 40 J = K + 1, N + END DO + END DO + DO J = K + 1, N CALL ZCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 ) - 40 CONTINUE + END DO * CALL ZUNMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK, $ M, WORK( M*N+1 ), LWORK-M*N, INFO ) * - DO 50 J = 1, N + DO J = 1, N * * Compare i-th column of QR and jpvt(i)-th column of A * CALL ZAXPY( M, DCMPLX( -ONE ), A( 1, JPVT( J ) ), 1, $ WORK( ( J-1 )*M+1 ), 1 ) - 50 CONTINUE + END DO * ZQPT01 = ZLANGE( 'One-norm', M, N, WORK, M, RWORK ) / $ ( DBLE( MAX( M, N ) )*DLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/zqrt02.f b/lapack-netlib/TESTING/LIN/zqrt02.f index 2dbefaf84..a32703c49 100644 --- a/lapack-netlib/TESTING/LIN/zqrt02.f +++ b/lapack-netlib/TESTING/LIN/zqrt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> ZQRT02 tests ZUNGQR, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QR factorization of an m-by-n matrix A, ZQRT02 generates diff --git a/lapack-netlib/TESTING/LIN/zqrt11.f b/lapack-netlib/TESTING/LIN/zqrt11.f index c3be59c36..dc4af744f 100644 --- a/lapack-netlib/TESTING/LIN/zqrt11.f +++ b/lapack-netlib/TESTING/LIN/zqrt11.f @@ -158,9 +158,9 @@ CALL ZUNM2R( 'Left', 'Conjugate transpose', M, M, K, A, LDA, TAU, $ WORK, M, WORK( M*M+1 ), INFO ) * - DO 10 J = 1, M + DO J = 1, M WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE - 10 CONTINUE + END DO * ZQRT11 = ZLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) / $ ( DBLE( M )*DLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/zqrt12.f b/lapack-netlib/TESTING/LIN/zqrt12.f index 0da6be157..91477b5ea 100644 --- a/lapack-netlib/TESTING/LIN/zqrt12.f +++ b/lapack-netlib/TESTING/LIN/zqrt12.f @@ -28,7 +28,7 @@ *> ZQRT12 computes the singular values `svlues' of the upper trapezoid *> of A(1:M,1:N) and returns the ratio *> -*> || s - svlues||/(||svlues||*eps*max(M,N)) +*> || svlues - s||/(||s||*eps*max(M,N)) *> \endverbatim * * Arguments: @@ -125,8 +125,8 @@ EXTERNAL DASUM, DLAMCH, DNRM2, ZLANGE * .. * .. External Subroutines .. - EXTERNAL DAXPY, DBDSQR, DLABAD, DLASCL, XERBLA, ZGEBD2, - $ ZLASCL, ZLASET + EXTERNAL DAXPY, DBDSQR, DLASCL, XERBLA, ZGEBD2, ZLASCL, + $ ZLASET * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MIN @@ -154,17 +154,16 @@ * CALL ZLASET( 'Full', M, N, DCMPLX( ZERO ), DCMPLX( ZERO ), WORK, $ M ) - DO 20 J = 1, N - DO 10 I = 1, MIN( J, M ) + DO J = 1, N + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = A( I, J ) - 10 CONTINUE - 20 CONTINUE + END DO + END DO * * Get machine parameters * SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale work if max entry outside range [SMLNUM,BIGNUM] * @@ -208,9 +207,9 @@ * ELSE * - DO 30 I = 1, MN + DO I = 1, MN RWORK( I ) = ZERO - 30 CONTINUE + END DO END IF * * Compare s and singular values of work @@ -218,6 +217,7 @@ CALL DAXPY( MN, -ONE, S, 1, RWORK( 1 ), 1 ) ZQRT12 = DASUM( MN, RWORK( 1 ), 1 ) / $ ( DLAMCH( 'Epsilon' )*DBLE( MAX( M, N ) ) ) +* IF( NRMSVL.NE.ZERO ) $ ZQRT12 = ZQRT12 / NRMSVL * diff --git a/lapack-netlib/TESTING/LIN/zrqt02.f b/lapack-netlib/TESTING/LIN/zrqt02.f index 548321d00..c44e04cb6 100644 --- a/lapack-netlib/TESTING/LIN/zrqt02.f +++ b/lapack-netlib/TESTING/LIN/zrqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> ZRQT02 tests ZUNGRQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the RQ factorization of an m-by-n matrix A, ZRQT02 generates diff --git a/lapack-netlib/TESTING/LIN/zrzt01.f b/lapack-netlib/TESTING/LIN/zrzt01.f index 2eba7ba6f..b6e4d73f9 100644 --- a/lapack-netlib/TESTING/LIN/zrzt01.f +++ b/lapack-netlib/TESTING/LIN/zrzt01.f @@ -159,7 +159,7 @@ * * R = R * P(1) * ... *P(m) * - CALL ZUNMRZ( 'Right', 'No tranpose', M, N, M, N-M, AF, LDA, TAU, + CALL ZUNMRZ( 'Right', 'No transpose', M, N, M, N-M, AF, LDA, TAU, $ WORK, M, WORK( M*N+1 ), LWORK-M*N, INFO ) * * R = R - A diff --git a/lapack-netlib/TESTING/LIN/zsyt01_3.f b/lapack-netlib/TESTING/LIN/zsyt01_3.f index 202488db9..a572353f2 100644 --- a/lapack-netlib/TESTING/LIN/zsyt01_3.f +++ b/lapack-netlib/TESTING/LIN/zsyt01_3.f @@ -188,7 +188,7 @@ RETURN END IF * -* a) Revert to multiplyers of L +* a) Revert to multipliers of L * CALL ZSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO ) * diff --git a/lapack-netlib/TESTING/MATGEN/clagge.c b/lapack-netlib/TESTING/MATGEN/clagge.c index f05905bd7..62c33d01e 100644 --- a/lapack-netlib/TESTING/MATGEN/clagge.c +++ b/lapack-netlib/TESTING/MATGEN/clagge.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b CLARND */ diff --git a/lapack-netlib/TESTING/MATGEN/claror.c b/lapack-netlib/TESTING/MATGEN/claror.c index cd0d15300..b0d73f37c 100644 --- a/lapack-netlib/TESTING/MATGEN/claror.c +++ b/lapack-netlib/TESTING/MATGEN/claror.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b CLATM2 */ diff --git a/lapack-netlib/TESTING/MATGEN/clatm3.c b/lapack-netlib/TESTING/MATGEN/clatm3.c index 58cd4e551..fcd8dbfcb 100644 --- a/lapack-netlib/TESTING/MATGEN/clatm3.c +++ b/lapack-netlib/TESTING/MATGEN/clatm3.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b CLATM3 */ diff --git a/lapack-netlib/TESTING/MATGEN/clatm5.c b/lapack-netlib/TESTING/MATGEN/clatm5.c index c2b81ccf3..8fbc1c0a6 100644 --- a/lapack-netlib/TESTING/MATGEN/clatm5.c +++ b/lapack-netlib/TESTING/MATGEN/clatm5.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b DLARAN */ diff --git a/lapack-netlib/TESTING/MATGEN/dlarge.c b/lapack-netlib/TESTING/MATGEN/dlarge.c index 5d8a81387..5cc7fbce8 100644 --- a/lapack-netlib/TESTING/MATGEN/dlarge.c +++ b/lapack-netlib/TESTING/MATGEN/dlarge.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b DLARND */ diff --git a/lapack-netlib/TESTING/MATGEN/dlaror.c b/lapack-netlib/TESTING/MATGEN/dlaror.c index d9e2e46ae..fdd126174 100644 --- a/lapack-netlib/TESTING/MATGEN/dlaror.c +++ b/lapack-netlib/TESTING/MATGEN/dlaror.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} static double dpow_ui(double x, integer n) { double pow=1.0; unsigned long int u; if(n != 0) { @@ -291,223 +273,6 @@ static double dpow_ui(double x, integer n) { } return pow; } -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b DLATM1 */ diff --git a/lapack-netlib/TESTING/MATGEN/dlatm2.c b/lapack-netlib/TESTING/MATGEN/dlatm2.c index d74bc9168..7491e9829 100644 --- a/lapack-netlib/TESTING/MATGEN/dlatm2.c +++ b/lapack-netlib/TESTING/MATGEN/dlatm2.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b DLATM2 */ diff --git a/lapack-netlib/TESTING/MATGEN/dlatm3.c b/lapack-netlib/TESTING/MATGEN/dlatm3.c index 86f964ced..a9d26c7fc 100644 --- a/lapack-netlib/TESTING/MATGEN/dlatm3.c +++ b/lapack-netlib/TESTING/MATGEN/dlatm3.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b DLATM3 */ diff --git a/lapack-netlib/TESTING/MATGEN/dlatm5.c b/lapack-netlib/TESTING/MATGEN/dlatm5.c index 94b49d6e3..7f1c36428 100644 --- a/lapack-netlib/TESTING/MATGEN/dlatm5.c +++ b/lapack-netlib/TESTING/MATGEN/dlatm5.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} static double dpow_ui(double x, integer n) { double pow=1.0; unsigned long int u; if(n != 0) { @@ -291,223 +273,6 @@ static double dpow_ui(double x, integer n) { } return pow; } -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b DLATM7 */ diff --git a/lapack-netlib/TESTING/MATGEN/dlatme.c b/lapack-netlib/TESTING/MATGEN/dlatme.c index a92c70ef2..e29df164c 100644 --- a/lapack-netlib/TESTING/MATGEN/dlatme.c +++ b/lapack-netlib/TESTING/MATGEN/dlatme.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b SLARAN */ diff --git a/lapack-netlib/TESTING/MATGEN/slarge.c b/lapack-netlib/TESTING/MATGEN/slarge.c index 6b37e9400..d5fbd541c 100644 --- a/lapack-netlib/TESTING/MATGEN/slarge.c +++ b/lapack-netlib/TESTING/MATGEN/slarge.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b SLARND */ diff --git a/lapack-netlib/TESTING/MATGEN/slaror.c b/lapack-netlib/TESTING/MATGEN/slaror.c index 48b532dfd..7e3065432 100644 --- a/lapack-netlib/TESTING/MATGEN/slaror.c +++ b/lapack-netlib/TESTING/MATGEN/slaror.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b SLATM1 */ diff --git a/lapack-netlib/TESTING/MATGEN/slatm2.c b/lapack-netlib/TESTING/MATGEN/slatm2.c index e7b72006f..833ee5dea 100644 --- a/lapack-netlib/TESTING/MATGEN/slatm2.c +++ b/lapack-netlib/TESTING/MATGEN/slatm2.c @@ -261,253 +261,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b SLATM2 */ diff --git a/lapack-netlib/TESTING/MATGEN/slatm3.c b/lapack-netlib/TESTING/MATGEN/slatm3.c index 4f9f5fee2..cdf96ef51 100644 --- a/lapack-netlib/TESTING/MATGEN/slatm3.c +++ b/lapack-netlib/TESTING/MATGEN/slatm3.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b SLATM3 */ diff --git a/lapack-netlib/TESTING/MATGEN/slatm5.c b/lapack-netlib/TESTING/MATGEN/slatm5.c index 24ee0915d..9122bc041 100644 --- a/lapack-netlib/TESTING/MATGEN/slatm5.c +++ b/lapack-netlib/TESTING/MATGEN/slatm5.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b SLATM7 */ diff --git a/lapack-netlib/TESTING/MATGEN/slatme.c b/lapack-netlib/TESTING/MATGEN/slatme.c index a8a6b39a3..126c42121 100644 --- a/lapack-netlib/TESTING/MATGEN/slatme.c +++ b/lapack-netlib/TESTING/MATGEN/slatme.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b ZLARND */ diff --git a/lapack-netlib/TESTING/MATGEN/zlaror.c b/lapack-netlib/TESTING/MATGEN/zlaror.c index 6ada57b8a..c8a84f215 100644 --- a/lapack-netlib/TESTING/MATGEN/zlaror.c +++ b/lapack-netlib/TESTING/MATGEN/zlaror.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} static double dpow_ui(double x, integer n) { double pow=1.0; unsigned long int u; if(n != 0) { @@ -291,224 +273,6 @@ static double dpow_ui(double x, integer n) { } return pow; } -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b ZLATM2 */ diff --git a/lapack-netlib/TESTING/MATGEN/zlatm3.c b/lapack-netlib/TESTING/MATGEN/zlatm3.c index c35ffe4d9..6370a9d39 100644 --- a/lapack-netlib/TESTING/MATGEN/zlatm3.c +++ b/lapack-netlib/TESTING/MATGEN/zlatm3.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b ZLATM3 */ diff --git a/lapack-netlib/TESTING/MATGEN/zlatm5.c b/lapack-netlib/TESTING/MATGEN/zlatm5.c index 753ee0ce6..5ee6cc8ce 100644 --- a/lapack-netlib/TESTING/MATGEN/zlatm5.c +++ b/lapack-netlib/TESTING/MATGEN/zlatm5.c @@ -248,7 +248,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -262,254 +261,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i $@ 2>&1 + +sdmd.out: sdmd.in EIG/xdmdeigtsts + @echo DMD: Testing Dynamic Mode Decomposition routines + ./EIG/xdmdeigtsts < sdmd.in > $@ 2>&1 # # ======== COMPLEX EIG TESTS =========================== @@ -379,6 +390,10 @@ ccsd.out: csd.in EIG/xeigtstc clse.out: lse.in EIG/xeigtstc @echo LSE: Testing Constrained Linear Least Squares routines ./EIG/xeigtstc < lse.in > $@ 2>&1 + +cdmd.out: cdmd.in EIG/xdmdeigtstc + @echo DMD: Testing Dynamic Mode Decomposition routines + ./EIG/xdmdeigtstc < cdmd.in > $@ 2>&1 # # ======== DOUBLE EIG TESTS =========================== @@ -461,6 +476,10 @@ dcsd.out: csd.in EIG/xeigtstd dlse.out: lse.in EIG/xeigtstd @echo LSE: Testing Constrained Linear Least Squares routines ./EIG/xeigtstd < lse.in > $@ 2>&1 + +ddmd.out: ddmd.in EIG/xdmdeigtstd + @echo DMD: Testing Dynamic Mode Decomposition routines + ./EIG/xdmdeigtstd < ddmd.in > $@ 2>&1 # # ======== COMPLEX16 EIG TESTS =========================== @@ -543,6 +562,10 @@ zcsd.out: csd.in EIG/xeigtstz zlse.out: lse.in EIG/xeigtstz @echo LSE: Testing Constrained Linear Least Squares routines ./EIG/xeigtstz < lse.in > $@ 2>&1 + +zdmd.out: zdmd.in EIG/xdmdeigtstz + @echo DMD: Testing Dynamic Mode Decomposition routines + ./EIG/xdmdeigtstz < zdmd.in > $@ 2>&1 # ============================================================================== LIN/xlintsts: $(FRCLIN) $(FRC) @@ -578,15 +601,27 @@ LIN/xlintstzc: $(FRCLIN) $(FRC) EIG/xeigtsts: $(FRCEIG) $(FRC) $(MAKE) -C EIG xeigtsts +EIG/xdmdeigtsts: $(FRCEIG) $(FRC) + $(MAKE) -C EIG xdmdeigtsts + EIG/xeigtstc: $(FRCEIG) $(FRC) $(MAKE) -C EIG xeigtstc +EIG/xdmdeigtstc: $(FRCEIG) $(FRC) + $(MAKE) -C EIG xdmdeigtstc + EIG/xeigtstd: $(FRCEIG) $(FRC) $(MAKE) -C EIG xeigtstd +EIG/xdmdeigtstd: $(FRCEIG) $(FRC) + $(MAKE) -C EIG xdmdeigtstd + EIG/xeigtstz: $(FRCEIG) $(FRC) $(MAKE) -C EIG xeigtstz +EIG/xdmdeigtstz: $(FRCEIG) $(FRC) + $(MAKE) -C EIG xdmdeigtstz + .PHONY: clean cleantest clean: cleantest cleantest: diff --git a/lapack-netlib/TESTING/cdmd.in b/lapack-netlib/TESTING/cdmd.in new file mode 100644 index 000000000..42d046e01 --- /dev/null +++ b/lapack-netlib/TESTING/cdmd.in @@ -0,0 +1,11 @@ +10 +5 + +20 +10 + +30 +11 + +50 +20 diff --git a/lapack-netlib/TESTING/ctest.in b/lapack-netlib/TESTING/ctest.in index a3588b4a1..74ff31ab8 100644 --- a/lapack-netlib/TESTING/ctest.in +++ b/lapack-netlib/TESTING/ctest.in @@ -42,6 +42,7 @@ CRQ 8 List types on next line if 0 < NTYPES < 8 CLQ 8 List types on next line if 0 < NTYPES < 8 CQL 8 List types on next line if 0 < NTYPES < 8 CQP 6 List types on next line if 0 < NTYPES < 6 +CQK 19 List types on next line if 0 < NTYPES < 19 CTZ 3 List types on next line if 0 < NTYPES < 3 CLS 6 List types on next line if 0 < NTYPES < 6 CEQ diff --git a/lapack-netlib/TESTING/ddmd.in b/lapack-netlib/TESTING/ddmd.in new file mode 100644 index 000000000..42d046e01 --- /dev/null +++ b/lapack-netlib/TESTING/ddmd.in @@ -0,0 +1,11 @@ +10 +5 + +20 +10 + +30 +11 + +50 +20 diff --git a/lapack-netlib/TESTING/dgd.in b/lapack-netlib/TESTING/dgd.in index ca24dd2a0..127b63789 100644 --- a/lapack-netlib/TESTING/dgd.in +++ b/lapack-netlib/TESTING/dgd.in @@ -1,6 +1,6 @@ DGS Data for the Real Nonsymmetric Schur Form Driver 5 Number of matrix dimensions -2 6 10 12 20 30 Matrix dimensions +2 10 12 20 30 Matrix dimensions 1 1 1 2 1 Parameters NB, NBMIN, NXOVER, NS, NBCOL 10 Threshold for test ratios .TRUE. Put T to test the error exits diff --git a/lapack-netlib/TESTING/dtest.in b/lapack-netlib/TESTING/dtest.in index 29bb8b92e..1b6c7bd4a 100644 --- a/lapack-netlib/TESTING/dtest.in +++ b/lapack-netlib/TESTING/dtest.in @@ -36,6 +36,7 @@ DRQ 8 List types on next line if 0 < NTYPES < 8 DLQ 8 List types on next line if 0 < NTYPES < 8 DQL 8 List types on next line if 0 < NTYPES < 8 DQP 6 List types on next line if 0 < NTYPES < 6 +DQK 19 LIst types on next line if 0 < NTYPES < 19 DTZ 3 List types on next line if 0 < NTYPES < 3 DLS 6 List types on next line if 0 < NTYPES < 6 DEQ diff --git a/lapack-netlib/TESTING/sdmd.in b/lapack-netlib/TESTING/sdmd.in new file mode 100644 index 000000000..42d046e01 --- /dev/null +++ b/lapack-netlib/TESTING/sdmd.in @@ -0,0 +1,11 @@ +10 +5 + +20 +10 + +30 +11 + +50 +20 diff --git a/lapack-netlib/TESTING/sgd.in b/lapack-netlib/TESTING/sgd.in index 164b3cc35..efab6c257 100644 --- a/lapack-netlib/TESTING/sgd.in +++ b/lapack-netlib/TESTING/sgd.in @@ -1,6 +1,6 @@ SGS Data for the Real Nonsymmetric Schur Form Driver 5 Number of matrix dimensions -2 6 10 12 20 30 Matrix dimensions +2 10 12 20 30 Matrix dimensions 1 1 1 2 1 Parameters NB, NBMIN, NXOVER, NS, NBCOL 10 Threshold for test ratios .TRUE. Put T to test the error exits diff --git a/lapack-netlib/TESTING/stest.in b/lapack-netlib/TESTING/stest.in index 27ac30040..7faa8b7a1 100644 --- a/lapack-netlib/TESTING/stest.in +++ b/lapack-netlib/TESTING/stest.in @@ -36,6 +36,7 @@ SRQ 8 List types on next line if 0 < NTYPES < 8 SLQ 8 List types on next line if 0 < NTYPES < 8 SQL 8 List types on next line if 0 < NTYPES < 8 SQP 6 List types on next line if 0 < NTYPES < 6 +SQK 19 List types on next line if 0 < NTYPES < 19 STZ 3 List types on next line if 0 < NTYPES < 3 SLS 6 List types on next line if 0 < NTYPES < 6 SEQ diff --git a/lapack-netlib/TESTING/zdmd.in b/lapack-netlib/TESTING/zdmd.in new file mode 100644 index 000000000..42d046e01 --- /dev/null +++ b/lapack-netlib/TESTING/zdmd.in @@ -0,0 +1,11 @@ +10 +5 + +20 +10 + +30 +11 + +50 +20 diff --git a/lapack-netlib/TESTING/ztest.in b/lapack-netlib/TESTING/ztest.in index 58da33d60..c83e82e45 100644 --- a/lapack-netlib/TESTING/ztest.in +++ b/lapack-netlib/TESTING/ztest.in @@ -42,6 +42,7 @@ ZRQ 8 List types on next line if 0 < NTYPES < 8 ZLQ 8 List types on next line if 0 < NTYPES < 8 ZQL 8 List types on next line if 0 < NTYPES < 8 ZQP 6 List types on next line if 0 < NTYPES < 6 +ZQK 19 List types on next line if 0 < NTYPES < 19 ZTZ 3 List types on next line if 0 < NTYPES < 3 ZLS 6 List types on next line if 0 < NTYPES < 6 ZEQ diff --git a/lapack-netlib/cgbsvx.f b/lapack-netlib/cgbsvx.f new file mode 100644 index 000000000..eaab5682c --- /dev/null +++ b/lapack-netlib/cgbsvx.f @@ -0,0 +1,644 @@ +*> \brief CGBSVX computes the solution to system of linear equations A * X = B for GB matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CGBSVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, +* LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, +* RCOND, FERR, BERR, WORK, RWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS +* REAL RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ) +* REAL BERR( * ), C( * ), FERR( * ), R( * ), +* $ RWORK( * ) +* COMPLEX AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CGBSVX uses the LU factorization to compute the solution to a complex +*> system of linear equations A * X = B, A**T * X = B, or A**H * X = B, +*> where A is a band matrix of order N with KL subdiagonals and KU +*> superdiagonals, and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed by this subroutine: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = L * U, +*> where L is a product of permutation and unit lower triangular +*> matrices with KL subdiagonals, and U is upper triangular with +*> KL+KU superdiagonals. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AFB and IPIV contain the factored form of +*> A. If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> AB, AFB, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AFB and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AFB and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations. +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KL +*> \verbatim +*> KL is INTEGER +*> The number of subdiagonals within the band of A. KL >= 0. +*> \endverbatim +*> +*> \param[in] KU +*> \verbatim +*> KU is INTEGER +*> The number of superdiagonals within the band of A. KU >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is COMPLEX array, dimension (LDAB,N) +*> On entry, the matrix A in band storage, in rows 1 to KL+KU+1. +*> The j-th column of A is stored in the j-th column of the +*> array AB as follows: +*> AB(KU+1+i-j,j) = A(i,j) for max(1,j-KU)<=i<=min(N,j+kl) +*> +*> If FACT = 'F' and EQUED is not 'N', then A must have been +*> equilibrated by the scaling factors in R and/or C. AB is not +*> modified if FACT = 'F' or 'N', or if FACT = 'E' and +*> EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] AFB +*> \verbatim +*> AFB is COMPLEX array, dimension (LDAFB,N) +*> If FACT = 'F', then AFB is an input argument and on entry +*> contains details of the LU factorization of the band matrix +*> A, as computed by CGBTRF. U is stored as an upper triangular +*> band matrix with KL+KU superdiagonals in rows 1 to KL+KU+1, +*> and the multipliers used during the factorization are stored +*> in rows KL+KU+2 to 2*KL+KU+1. If EQUED .ne. 'N', then AFB is +*> the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AFB is an output argument and on exit +*> returns details of the LU factorization of A. +*> +*> If FACT = 'E', then AFB is an output argument and on exit +*> returns details of the LU factorization of the equilibrated +*> matrix A (see the description of AB for the form of the +*> equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAFB +*> \verbatim +*> LDAFB is INTEGER +*> The leading dimension of the array AFB. LDAFB >= 2*KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = L*U +*> as computed by CGBTRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is REAL array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX array, dimension (LDB,NRHS) +*> On entry, the right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is COMPLEX array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). +*> \endverbatim +*> +*> \param[out] RCOND +*> \verbatim +*> RCOND is REAL +*> The estimate of the reciprocal condition number of the matrix +*> A after equilibration (if done). If RCOND is less than the +*> machine precision (in particular, if RCOND = 0), the matrix +*> is singular to working precision. This condition is +*> indicated by a return code of INFO > 0. +*> \endverbatim +*> +*> \param[out] FERR +*> \verbatim +*> FERR is REAL array, dimension (NRHS) +*> The estimated forward error bound for each solution vector +*> X(j) (the j-th column of the solution matrix X). +*> If XTRUE is the true solution corresponding to X(j), FERR(j) +*> is an estimated upper bound for the magnitude of the largest +*> element in (X(j) - XTRUE) divided by the magnitude of the +*> largest element in X(j). The estimate is as reliable as +*> the estimate for RCOND, and is almost always a slight +*> overestimate of the true error. +*> \endverbatim +*> +*> \param[out] BERR +*> \verbatim +*> BERR is REAL array, dimension (NRHS) +*> The componentwise relative backward error of each solution +*> vector X(j) (i.e., the smallest relative change in +*> any element of A or B that makes X(j) an exact solution). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (2*N) +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (MAX(1,N)) +*> On exit, RWORK(1) contains the reciprocal pivot growth +*> factor norm(A)/norm(U). The "max absolute element" norm is +*> used. If RWORK(1) is much less than 1, then the stability +*> of the LU factorization of the (equilibrated) matrix A +*> could be poor. This also means that the solution X, condition +*> estimator RCOND, and forward error bound FERR could be +*> unreliable. If factorization fails with 0 RWORK(1) contains the reciprocal pivot growth factor for the +*> leading INFO columns of A. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, and i is +*> <= N: U(i,i) is exactly zero. The factorization +*> has been completed, but the factor U is exactly +*> singular, so the solution and error bounds +*> could not be computed. RCOND = 0 is returned. +*> = N+1: U is nonsingular, but RCOND is less than machine +*> precision, meaning that the matrix is singular +*> to working precision. Nevertheless, the +*> solution and error bounds are computed because +*> there are a number of situations where the +*> computed solution can be more accurate than the +*> value of RCOND would suggest. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexGBsolve +* +* ===================================================================== + SUBROUTINE CGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, + $ LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, + $ RCOND, FERR, BERR, WORK, RWORK, INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER EQUED, FACT, TRANS + INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS + REAL RCOND +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + REAL BERR( * ), C( * ), FERR( * ), R( * ), + $ RWORK( * ) + COMPLEX AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), + $ WORK( * ), X( LDX, * ) +* .. +* +* ===================================================================== +* Moved setting of INFO = N+1 so INFO does not subsequently get +* overwritten. Sven, 17 Mar 05. +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU + CHARACTER NORM + INTEGER I, INFEQU, J, J1, J2 + REAL AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, + $ ROWCND, RPVGRW, SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + REAL CLANGB, CLANTB, SLAMCH + EXTERNAL LSAME, CLANGB, CLANTB, SLAMCH +* .. +* .. External Subroutines .. + EXTERNAL CCOPY, CGBCON, CGBEQU, CGBRFS, CGBTRF, CGBTRS, + $ CLACPY, CLAQGB, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + NOFACT = LSAME( FACT, 'N' ) + EQUIL = LSAME( FACT, 'E' ) + NOTRAN = LSAME( TRANS, 'N' ) + IF( NOFACT .OR. EQUIL ) THEN + EQUED = 'N' + ROWEQU = .FALSE. + COLEQU = .FALSE. + ELSE + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + SMLNUM = SLAMCH( 'Safe minimum' ) + BIGNUM = ONE / SMLNUM + END IF +* +* Test the input parameters. +* + IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) ) + $ THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( KL.LT.0 ) THEN + INFO = -4 + ELSE IF( KU.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDAB.LT.KL+KU+1 ) THEN + INFO = -8 + ELSE IF( LDAFB.LT.2*KL+KU+1 ) THEN + INFO = -10 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -12 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -13 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -14 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -16 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -18 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGBSVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL CGBEQU( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL CLAQGB( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. EQUIL ) THEN +* +* Compute the LU factorization of the band matrix A. +* + DO 70 J = 1, N + J1 = MAX( J-KU, 1 ) + J2 = MIN( J+KL, N ) + CALL CCOPY( J2-J1+1, AB( KU+1-J+J1, J ), 1, + $ AFB( KL+KU+1-J+J1, J ), 1 ) + 70 CONTINUE +* + CALL CGBTRF( N, N, KL, KU, AFB, LDAFB, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + ANORM = ZERO + DO 90 J = 1, INFO + DO 80 I = MAX( KU+2-J, 1 ), MIN( N+KU+1-J, KL+KU+1 ) + ANORM = MAX( ANORM, ABS( AB( I, J ) ) ) + 80 CONTINUE + 90 CONTINUE + RPVGRW = CLANTB( 'M', 'U', 'N', INFO, MIN( INFO-1, KL+KU ), + $ AFB( MAX( 1, KL+KU+2-INFO ), 1 ), LDAFB, + $ RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = ANORM / RPVGRW + END IF + RWORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = CLANGB( NORM, N, KL, KU, AB, LDAB, RWORK ) + RPVGRW = CLANTB( 'M', 'U', 'N', N, KL+KU, AFB, LDAFB, RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = CLANGB( 'M', N, KL, KU, AB, LDAB, RWORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL CGBCON( NORM, N, KL, KU, AFB, LDAFB, IPIV, ANORM, RCOND, + $ WORK, RWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL CLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL CGBTRS( TRANS, N, KL, KU, NRHS, AFB, LDAFB, IPIV, X, LDX, + $ INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL CGBRFS( TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, LDAFB, IPIV, + $ B, LDB, X, LDX, FERR, BERR, WORK, RWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. +* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 120 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 140 J = 1, NRHS + DO 130 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 130 CONTINUE + 140 CONTINUE + DO 150 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 150 CONTINUE + END IF +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 +* + RWORK( 1 ) = RPVGRW + RETURN +* +* End of CGBSVX +* + END diff --git a/lapack-netlib/cgejsv.f b/lapack-netlib/cgejsv.f new file mode 100644 index 000000000..51a6cee4e --- /dev/null +++ b/lapack-netlib/cgejsv.f @@ -0,0 +1,2232 @@ +*> \brief \b CGEJSV +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CGEJSV + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, +* M, N, A, LDA, SVA, U, LDU, V, LDV, +* CWORK, LWORK, RWORK, LRWORK, IWORK, INFO ) +* +* .. Scalar Arguments .. +* IMPLICIT NONE +* INTEGER INFO, LDA, LDU, LDV, LWORK, M, N +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), U( LDU, * ), V( LDV, * ), CWORK( LWORK ) +* REAL SVA( N ), RWORK( LRWORK ) +* INTEGER IWORK( * ) +* CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CGEJSV computes the singular value decomposition (SVD) of a complex M-by-N +*> matrix [A], where M >= N. The SVD of [A] is written as +*> +*> [A] = [U] * [SIGMA] * [V]^*, +*> +*> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N +*> diagonal elements, [U] is an M-by-N (or M-by-M) unitary matrix, and +*> [V] is an N-by-N unitary matrix. The diagonal elements of [SIGMA] are +*> the singular values of [A]. The columns of [U] and [V] are the left and +*> the right singular vectors of [A], respectively. The matrices [U] and [V] +*> are computed and stored in the arrays U and V, respectively. The diagonal +*> of [SIGMA] is computed and stored in the array SVA. +*> \endverbatim +*> +*> Arguments: +*> ========== +*> +*> \param[in] JOBA +*> \verbatim +*> JOBA is CHARACTER*1 +*> Specifies the level of accuracy: +*> = 'C': This option works well (high relative accuracy) if A = B * D, +*> with well-conditioned B and arbitrary diagonal matrix D. +*> The accuracy cannot be spoiled by COLUMN scaling. The +*> accuracy of the computed output depends on the condition of +*> B, and the procedure aims at the best theoretical accuracy. +*> The relative error max_{i=1:N}|d sigma_i| / sigma_i is +*> bounded by f(M,N)*epsilon* cond(B), independent of D. +*> The input matrix is preprocessed with the QRF with column +*> pivoting. This initial preprocessing and preconditioning by +*> a rank revealing QR factorization is common for all values of +*> JOBA. Additional actions are specified as follows: +*> = 'E': Computation as with 'C' with an additional estimate of the +*> condition number of B. It provides a realistic error bound. +*> = 'F': If A = D1 * C * D2 with ill-conditioned diagonal scalings +*> D1, D2, and well-conditioned matrix C, this option gives +*> higher accuracy than the 'C' option. If the structure of the +*> input matrix is not known, and relative accuracy is +*> desirable, then this option is advisable. The input matrix A +*> is preprocessed with QR factorization with FULL (row and +*> column) pivoting. +*> = 'G': Computation as with 'F' with an additional estimate of the +*> condition number of B, where A=B*D. If A has heavily weighted +*> rows, then using this condition number gives too pessimistic +*> error bound. +*> = 'A': Small singular values are not well determined by the data +*> and are considered as noisy; the matrix is treated as +*> numerically rank deficient. The error in the computed +*> singular values is bounded by f(m,n)*epsilon*||A||. +*> The computed SVD A = U * S * V^* restores A up to +*> f(m,n)*epsilon*||A||. +*> This gives the procedure the licence to discard (set to zero) +*> all singular values below N*epsilon*||A||. +*> = 'R': Similar as in 'A'. Rank revealing property of the initial +*> QR factorization is used do reveal (using triangular factor) +*> a gap sigma_{r+1} < epsilon * sigma_r in which case the +*> numerical RANK is declared to be r. The SVD is computed with +*> absolute error bounds, but more accurately than with 'A'. +*> \endverbatim +*> +*> \param[in] JOBU +*> \verbatim +*> JOBU is CHARACTER*1 +*> Specifies whether to compute the columns of U: +*> = 'U': N columns of U are returned in the array U. +*> = 'F': full set of M left sing. vectors is returned in the array U. +*> = 'W': U may be used as workspace of length M*N. See the description +*> of U. +*> = 'N': U is not computed. +*> \endverbatim +*> +*> \param[in] JOBV +*> \verbatim +*> JOBV is CHARACTER*1 +*> Specifies whether to compute the matrix V: +*> = 'V': N columns of V are returned in the array V; Jacobi rotations +*> are not explicitly accumulated. +*> = 'J': N columns of V are returned in the array V, but they are +*> computed as the product of Jacobi rotations, if JOBT = 'N'. +*> = 'W': V may be used as workspace of length N*N. See the description +*> of V. +*> = 'N': V is not computed. +*> \endverbatim +*> +*> \param[in] JOBR +*> \verbatim +*> JOBR is CHARACTER*1 +*> Specifies the RANGE for the singular values. Issues the licence to +*> set to zero small positive singular values if they are outside +*> specified range. If A .NE. 0 is scaled so that the largest singular +*> value of c*A is around SQRT(BIG), BIG=SLAMCH('O'), then JOBR issues +*> the licence to kill columns of A whose norm in c*A is less than +*> SQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN, +*> where SFMIN=SLAMCH('S'), EPSLN=SLAMCH('E'). +*> = 'N': Do not kill small columns of c*A. This option assumes that +*> BLAS and QR factorizations and triangular solvers are +*> implemented to work in that range. If the condition of A +*> is greater than BIG, use CGESVJ. +*> = 'R': RESTRICTED range for sigma(c*A) is [SQRT(SFMIN), SQRT(BIG)] +*> (roughly, as described above). This option is recommended. +*> =========================== +*> For computing the singular values in the FULL range [SFMIN,BIG] +*> use CGESVJ. +*> \endverbatim +*> +*> \param[in] JOBT +*> \verbatim +*> JOBT is CHARACTER*1 +*> If the matrix is square then the procedure may determine to use +*> transposed A if A^* seems to be better with respect to convergence. +*> If the matrix is not square, JOBT is ignored. +*> The decision is based on two values of entropy over the adjoint +*> orbit of A^* * A. See the descriptions of RWORK(6) and RWORK(7). +*> = 'T': transpose if entropy test indicates possibly faster +*> convergence of Jacobi process if A^* is taken as input. If A is +*> replaced with A^*, then the row pivoting is included automatically. +*> = 'N': do not speculate. +*> The option 'T' can be used to compute only the singular values, or +*> the full SVD (U, SIGMA and V). For only one set of singular vectors +*> (U or V), the caller should provide both U and V, as one of the +*> matrices is used as workspace if the matrix A is transposed. +*> The implementer can easily remove this constraint and make the +*> code more complicated. See the descriptions of U and V. +*> In general, this option is considered experimental, and 'N'; should +*> be preferred. This is subject to changes in the future. +*> \endverbatim +*> +*> \param[in] JOBP +*> \verbatim +*> JOBP is CHARACTER*1 +*> Issues the licence to introduce structured perturbations to drown +*> denormalized numbers. This licence should be active if the +*> denormals are poorly implemented, causing slow computation, +*> especially in cases of fast convergence (!). For details see [1,2]. +*> For the sake of simplicity, this perturbations are included only +*> when the full SVD or only the singular values are requested. The +*> implementer/user can easily add the perturbation for the cases of +*> computing one set of singular vectors. +*> = 'P': introduce perturbation +*> = 'N': do not perturb +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the input matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the input matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] SVA +*> \verbatim +*> SVA is REAL array, dimension (N) +*> On exit, +*> - For RWORK(1)/RWORK(2) = ONE: The singular values of A. During +*> the computation SVA contains Euclidean column norms of the +*> iterated matrices in the array A. +*> - For RWORK(1) .NE. RWORK(2): The singular values of A are +*> (RWORK(1)/RWORK(2)) * SVA(1:N). This factored form is used if +*> sigma_max(A) overflows or if small singular values have been +*> saved from underflow by scaling the input matrix A. +*> - If JOBR='R' then some of the singular values may be returned +*> as exact zeros obtained by "set to zero" because they are +*> below the numerical rank threshold or are denormalized numbers. +*> \endverbatim +*> +*> \param[out] U +*> \verbatim +*> U is COMPLEX array, dimension ( LDU, N ) or ( LDU, M ) +*> If JOBU = 'U', then U contains on exit the M-by-N matrix of +*> the left singular vectors. +*> If JOBU = 'F', then U contains on exit the M-by-M matrix of +*> the left singular vectors, including an ONB +*> of the orthogonal complement of the Range(A). +*> If JOBU = 'W' .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N), +*> then U is used as workspace if the procedure +*> replaces A with A^*. In that case, [V] is computed +*> in U as left singular vectors of A^* and then +*> copied back to the V array. This 'W' option is just +*> a reminder to the caller that in this case U is +*> reserved as workspace of length N*N. +*> If JOBU = 'N' U is not referenced, unless JOBT='T'. +*> \endverbatim +*> +*> \param[in] LDU +*> \verbatim +*> LDU is INTEGER +*> The leading dimension of the array U, LDU >= 1. +*> IF JOBU = 'U' or 'F' or 'W', then LDU >= M. +*> \endverbatim +*> +*> \param[out] V +*> \verbatim +*> V is COMPLEX array, dimension ( LDV, N ) +*> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of +*> the right singular vectors; +*> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), +*> then V is used as workspace if the pprocedure +*> replaces A with A^*. In that case, [U] is computed +*> in V as right singular vectors of A^* and then +*> copied back to the U array. This 'W' option is just +*> a reminder to the caller that in this case V is +*> reserved as workspace of length N*N. +*> If JOBV = 'N' V is not referenced, unless JOBT='T'. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V, LDV >= 1. +*> If JOBV = 'V' or 'J' or 'W', then LDV >= N. +*> \endverbatim +*> +*> \param[out] CWORK +*> \verbatim +*> CWORK is COMPLEX array, dimension (MAX(2,LWORK)) +*> If the call to CGEJSV is a workspace query (indicated by LWORK=-1 or +*> LRWORK=-1), then on exit CWORK(1) contains the required length of +*> CWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> Length of CWORK to confirm proper allocation of workspace. +*> LWORK depends on the job: +*> +*> 1. If only SIGMA is needed ( JOBU = 'N', JOBV = 'N' ) and +*> 1.1 .. no scaled condition estimate required (JOBA.NE.'E'.AND.JOBA.NE.'G'): +*> LWORK >= 2*N+1. This is the minimal requirement. +*> ->> For optimal performance (blocked code) the optimal value +*> is LWORK >= N + (N+1)*NB. Here NB is the optimal +*> block size for CGEQP3 and CGEQRF. +*> In general, optimal LWORK is computed as +*> LWORK >= max(N+LWORK(CGEQP3),N+LWORK(CGEQRF), LWORK(CGESVJ)). +*> 1.2. .. an estimate of the scaled condition number of A is +*> required (JOBA='E', or 'G'). In this case, LWORK the minimal +*> requirement is LWORK >= N*N + 2*N. +*> ->> For optimal performance (blocked code) the optimal value +*> is LWORK >= max(N+(N+1)*NB, N*N+2*N)=N**2+2*N. +*> In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(CGEQP3),N+LWORK(CGEQRF), LWORK(CGESVJ), +*> N*N+LWORK(CPOCON)). +*> 2. If SIGMA and the right singular vectors are needed (JOBV = 'V'), +*> (JOBU = 'N') +*> 2.1 .. no scaled condition estimate requested (JOBE = 'N'): +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance, +*> LWORK >= max(N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for CGEQP3, CGEQRF, CGELQF, +*> CUNMLQ. In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(CGEQP3), N+LWORK(CGESVJ), +*> N+LWORK(CGELQF), 2*N+LWORK(CGEQRF), N+LWORK(CUNMLQ)). +*> 2.2 .. an estimate of the scaled condition number of A is +*> required (JOBA='E', or 'G'). +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance, +*> LWORK >= max(N+(N+1)*NB, 2*N,2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for CGEQP3, CGEQRF, CGELQF, +*> CUNMLQ. In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(CGEQP3), LWORK(CPOCON), N+LWORK(CGESVJ), +*> N+LWORK(CGELQF), 2*N+LWORK(CGEQRF), N+LWORK(CUNMLQ)). +*> 3. If SIGMA and the left singular vectors are needed +*> 3.1 .. no scaled condition estimate requested (JOBE = 'N'): +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance: +*> if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for CGEQP3, CGEQRF, CUNMQR. +*> In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(CGEQP3), 2*N+LWORK(CGEQRF), N+LWORK(CUNMQR)). +*> 3.2 .. an estimate of the scaled condition number of A is +*> required (JOBA='E', or 'G'). +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance: +*> if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for CGEQP3, CGEQRF, CUNMQR. +*> In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(CGEQP3),N+LWORK(CPOCON), +*> 2*N+LWORK(CGEQRF), N+LWORK(CUNMQR)). +*> +*> 4. If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and +*> 4.1. if JOBV = 'V' +*> the minimal requirement is LWORK >= 5*N+2*N*N. +*> 4.2. if JOBV = 'J' the minimal requirement is +*> LWORK >= 4*N+N*N. +*> In both cases, the allocated CWORK can accommodate blocked runs +*> of CGEQP3, CGEQRF, CGELQF, CUNMQR, CUNMLQ. +*> +*> If the call to CGEJSV is a workspace query (indicated by LWORK=-1 or +*> LRWORK=-1), then on exit CWORK(1) contains the optimal and CWORK(2) contains the +*> minimal length of CWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (MAX(7,LRWORK)) +*> On exit, +*> RWORK(1) = Determines the scaling factor SCALE = RWORK(2) / RWORK(1) +*> such that SCALE*SVA(1:N) are the computed singular values +*> of A. (See the description of SVA().) +*> RWORK(2) = See the description of RWORK(1). +*> RWORK(3) = SCONDA is an estimate for the condition number of +*> column equilibrated A. (If JOBA = 'E' or 'G') +*> SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1). +*> It is computed using CPOCON. It holds +*> N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA +*> where R is the triangular factor from the QRF of A. +*> However, if R is truncated and the numerical rank is +*> determined to be strictly smaller than N, SCONDA is +*> returned as -1, thus indicating that the smallest +*> singular values might be lost. +*> +*> If full SVD is needed, the following two condition numbers are +*> useful for the analysis of the algorithm. They are provided for +*> a developer/implementer who is familiar with the details of +*> the method. +*> +*> RWORK(4) = an estimate of the scaled condition number of the +*> triangular factor in the first QR factorization. +*> RWORK(5) = an estimate of the scaled condition number of the +*> triangular factor in the second QR factorization. +*> The following two parameters are computed if JOBT = 'T'. +*> They are provided for a developer/implementer who is familiar +*> with the details of the method. +*> RWORK(6) = the entropy of A^* * A :: this is the Shannon entropy +*> of diag(A^* * A) / Trace(A^* * A) taken as point in the +*> probability simplex. +*> RWORK(7) = the entropy of A * A^*. (See the description of RWORK(6).) +*> If the call to CGEJSV is a workspace query (indicated by LWORK=-1 or +*> LRWORK=-1), then on exit RWORK(1) contains the required length of +*> RWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[in] LRWORK +*> \verbatim +*> LRWORK is INTEGER +*> Length of RWORK to confirm proper allocation of workspace. +*> LRWORK depends on the job: +*> +*> 1. If only the singular values are requested i.e. if +*> LSAME(JOBU,'N') .AND. LSAME(JOBV,'N') +*> then: +*> 1.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then: LRWORK = max( 7, 2 * M ). +*> 1.2. Otherwise, LRWORK = max( 7, N ). +*> 2. If singular values with the right singular vectors are requested +*> i.e. if +*> (LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) .AND. +*> .NOT.(LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) +*> then: +*> 2.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then LRWORK = max( 7, 2 * M ). +*> 2.2. Otherwise, LRWORK = max( 7, N ). +*> 3. If singular values with the left singular vectors are requested, i.e. if +*> (LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) .AND. +*> .NOT.(LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) +*> then: +*> 3.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then LRWORK = max( 7, 2 * M ). +*> 3.2. Otherwise, LRWORK = max( 7, N ). +*> 4. If singular values with both the left and the right singular vectors +*> are requested, i.e. if +*> (LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) .AND. +*> (LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) +*> then: +*> 4.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then LRWORK = max( 7, 2 * M ). +*> 4.2. Otherwise, LRWORK = max( 7, N ). +*> +*> If, on entry, LRWORK = -1 or LWORK=-1, a workspace query is assumed and +*> the length of RWORK is returned in RWORK(1). +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, of dimension at least 4, that further depends +*> on the job: +*> +*> 1. If only the singular values are requested then: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 2. If the singular values and the right singular vectors are requested then: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 3. If the singular values and the left singular vectors are requested then: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 4. If the singular values with both the left and the right singular vectors +*> are requested, then: +*> 4.1. If LSAME(JOBV,'J') the length of IWORK is determined as follows: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 4.2. If LSAME(JOBV,'V') the length of IWORK is determined as follows: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is 2*N+M; otherwise the length of IWORK is 2*N. +*> +*> On exit, +*> IWORK(1) = the numerical rank determined after the initial +*> QR factorization with pivoting. See the descriptions +*> of JOBA and JOBR. +*> IWORK(2) = the number of the computed nonzero singular values +*> IWORK(3) = if nonzero, a warning message: +*> If IWORK(3) = 1 then some of the column norms of A +*> were denormalized floats. The requested high accuracy +*> is not warranted by the data. +*> IWORK(4) = 1 or -1. If IWORK(4) = 1, then the procedure used A^* to +*> do the job as specified by the JOB parameters. +*> If the call to CGEJSV is a workspace query (indicated by LWORK = -1 and +*> LRWORK = -1), then on exit IWORK(1) contains the required length of +*> IWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> < 0: if INFO = -i, then the i-th argument had an illegal value. +*> = 0: successful exit; +*> > 0: CGEJSV did not converge in the maximal allowed number +*> of sweeps. The computed values may be inaccurate. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexGEsing +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> CGEJSV implements a preconditioned Jacobi SVD algorithm. It uses CGEQP3, +*> CGEQRF, and CGELQF as preprocessors and preconditioners. Optionally, an +*> additional row pivoting can be used as a preprocessor, which in some +*> cases results in much higher accuracy. An example is matrix A with the +*> structure A = D1 * C * D2, where D1, D2 are arbitrarily ill-conditioned +*> diagonal matrices and C is well-conditioned matrix. In that case, complete +*> pivoting in the first QR factorizations provides accuracy dependent on the +*> condition number of C, and independent of D1, D2. Such higher accuracy is +*> not completely understood theoretically, but it works well in practice. +*> Further, if A can be written as A = B*D, with well-conditioned B and some +*> diagonal D, then the high accuracy is guaranteed, both theoretically and +*> in software, independent of D. For more details see [1], [2]. +*> The computational range for the singular values can be the full range +*> ( UNDERFLOW,OVERFLOW ), provided that the machine arithmetic and the BLAS +*> & LAPACK routines called by CGEJSV are implemented to work in that range. +*> If that is not the case, then the restriction for safe computation with +*> the singular values in the range of normalized IEEE numbers is that the +*> spectral condition number kappa(A)=sigma_max(A)/sigma_min(A) does not +*> overflow. This code (CGEJSV) is best used in this restricted range, +*> meaning that singular values of magnitude below ||A||_2 / SLAMCH('O') are +*> returned as zeros. See JOBR for details on this. +*> Further, this implementation is somewhat slower than the one described +*> in [1,2] due to replacement of some non-LAPACK components, and because +*> the choice of some tuning parameters in the iterative part (CGESVJ) is +*> left to the implementer on a particular machine. +*> The rank revealing QR factorization (in this code: CGEQP3) should be +*> implemented as in [3]. We have a new version of CGEQP3 under development +*> that is more robust than the current one in LAPACK, with a cleaner cut in +*> rank deficient cases. It will be available in the SIGMA library [4]. +*> If M is much larger than N, it is obvious that the initial QRF with +*> column pivoting can be preprocessed by the QRF without pivoting. That +*> well known trick is not used in CGEJSV because in some cases heavy row +*> weighting can be treated with complete pivoting. The overhead in cases +*> M much larger than N is then only due to pivoting, but the benefits in +*> terms of accuracy have prevailed. The implementer/user can incorporate +*> this extra QRF step easily. The implementer can also improve data movement +*> (matrix transpose, matrix copy, matrix transposed copy) - this +*> implementation of CGEJSV uses only the simplest, naive data movement. +*> \endverbatim +* +*> \par Contributor: +* ================== +*> +*> Zlatko Drmac (Zagreb, Croatia) +* +*> \par References: +* ================ +*> +*> \verbatim +*> +*> [1] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm I. +*> SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1322-1342. +*> LAPACK Working note 169. +*> [2] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm II. +*> SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1343-1362. +*> LAPACK Working note 170. +*> [3] Z. Drmac and Z. Bujanovic: On the failure of rank-revealing QR +*> factorization software - a case study. +*> ACM Trans. Math. Softw. Vol. 35, No 2 (2008), pp. 1-28. +*> LAPACK Working note 176. +*> [4] Z. Drmac: SIGMA - mathematical software library for accurate SVD, PSV, +*> QSVD, (H,K)-SVD computations. +*> Department of Mathematics, University of Zagreb, 2008, 2016. +*> \endverbatim +* +*> \par Bugs, examples and comments: +* ================================= +*> +*> Please report all bugs and send interesting examples and/or comments to +*> drmac@math.hr. Thank you. +*> +* ===================================================================== + SUBROUTINE CGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, + $ M, N, A, LDA, SVA, U, LDU, V, LDV, + $ CWORK, LWORK, RWORK, LRWORK, IWORK, INFO ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + IMPLICIT NONE + INTEGER INFO, LDA, LDU, LDV, LWORK, LRWORK, M, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), U( LDU, * ), V( LDV, * ), CWORK( LWORK ) + REAL SVA( N ), RWORK( LRWORK ) + INTEGER IWORK( * ) + CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV +* .. +* +* =========================================================================== +* +* .. Local Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 ) + COMPLEX CZERO, CONE + PARAMETER ( CZERO = ( 0.0E0, 0.0E0 ), CONE = ( 1.0E0, 0.0E0 ) ) +* .. +* .. Local Scalars .. + COMPLEX CTEMP + REAL AAPP, AAQQ, AATMAX, AATMIN, BIG, BIG1, COND_OK, + $ CONDR1, CONDR2, ENTRA, ENTRAT, EPSLN, MAXPRJ, SCALEM, + $ SCONDA, SFMIN, SMALL, TEMP1, USCAL1, USCAL2, XSC + INTEGER IERR, N1, NR, NUMRANK, p, q, WARNING + LOGICAL ALMORT, DEFR, ERREST, GOSCAL, JRACC, KILL, LQUERY, + $ LSVEC, L2ABER, L2KILL, L2PERT, L2RANK, L2TRAN, NOSCAL, + $ ROWPIV, RSVEC, TRANSP +* + INTEGER OPTWRK, MINWRK, MINRWRK, MINIWRK + INTEGER LWCON, LWLQF, LWQP3, LWQRF, LWUNMLQ, LWUNMQR, LWUNMQRM, + $ LWSVDJ, LWSVDJV, LRWQP3, LRWCON, LRWSVDJ, IWOFF + INTEGER LWRK_CGELQF, LWRK_CGEQP3, LWRK_CGEQP3N, LWRK_CGEQRF, + $ LWRK_CGESVJ, LWRK_CGESVJV, LWRK_CGESVJU, LWRK_CUNMLQ, + $ LWRK_CUNMQR, LWRK_CUNMQRM +* .. +* .. Local Arrays + COMPLEX CDUMMY(1) + REAL RDUMMY(1) +* +* .. Intrinsic Functions .. + INTRINSIC ABS, CMPLX, CONJG, ALOG, MAX, MIN, REAL, NINT, SQRT +* .. +* .. External Functions .. + REAL SLAMCH, SCNRM2 + INTEGER ISAMAX, ICAMAX + LOGICAL LSAME + EXTERNAL ISAMAX, ICAMAX, LSAME, SLAMCH, SCNRM2 +* .. +* .. External Subroutines .. + EXTERNAL SLASSQ, CCOPY, CGELQF, CGEQP3, CGEQRF, CLACPY, CLAPMR, + $ CLASCL, SLASCL, CLASET, CLASSQ, CLASWP, CUNGQR, CUNMLQ, + $ CUNMQR, CPOCON, SSCAL, CSSCAL, CSWAP, CTRSM, CLACGV, + $ XERBLA +* + EXTERNAL CGESVJ +* .. +* +* Test the input arguments +* + LSVEC = LSAME( JOBU, 'U' ) .OR. LSAME( JOBU, 'F' ) + JRACC = LSAME( JOBV, 'J' ) + RSVEC = LSAME( JOBV, 'V' ) .OR. JRACC + ROWPIV = LSAME( JOBA, 'F' ) .OR. LSAME( JOBA, 'G' ) + L2RANK = LSAME( JOBA, 'R' ) + L2ABER = LSAME( JOBA, 'A' ) + ERREST = LSAME( JOBA, 'E' ) .OR. LSAME( JOBA, 'G' ) + L2TRAN = LSAME( JOBT, 'T' ) .AND. ( M .EQ. N ) + L2KILL = LSAME( JOBR, 'R' ) + DEFR = LSAME( JOBR, 'N' ) + L2PERT = LSAME( JOBP, 'P' ) +* + LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 ) +* + IF ( .NOT.(ROWPIV .OR. L2RANK .OR. L2ABER .OR. + $ ERREST .OR. LSAME( JOBA, 'C' ) )) THEN + INFO = - 1 + ELSE IF ( .NOT.( LSVEC .OR. LSAME( JOBU, 'N' ) .OR. + $ ( LSAME( JOBU, 'W' ) .AND. RSVEC .AND. L2TRAN ) ) ) THEN + INFO = - 2 + ELSE IF ( .NOT.( RSVEC .OR. LSAME( JOBV, 'N' ) .OR. + $ ( LSAME( JOBV, 'W' ) .AND. LSVEC .AND. L2TRAN ) ) ) THEN + INFO = - 3 + ELSE IF ( .NOT. ( L2KILL .OR. DEFR ) ) THEN + INFO = - 4 + ELSE IF ( .NOT. ( LSAME(JOBT,'T') .OR. LSAME(JOBT,'N') ) ) THEN + INFO = - 5 + ELSE IF ( .NOT. ( L2PERT .OR. LSAME( JOBP, 'N' ) ) ) THEN + INFO = - 6 + ELSE IF ( M .LT. 0 ) THEN + INFO = - 7 + ELSE IF ( ( N .LT. 0 ) .OR. ( N .GT. M ) ) THEN + INFO = - 8 + ELSE IF ( LDA .LT. M ) THEN + INFO = - 10 + ELSE IF ( LSVEC .AND. ( LDU .LT. M ) ) THEN + INFO = - 13 + ELSE IF ( RSVEC .AND. ( LDV .LT. N ) ) THEN + INFO = - 15 + ELSE +* #:) + INFO = 0 + END IF +* + IF ( INFO .EQ. 0 ) THEN +* .. compute the minimal and the optimal workspace lengths +* [[The expressions for computing the minimal and the optimal +* values of LCWORK, LRWORK are written with a lot of redundancy and +* can be simplified. However, this verbose form is useful for +* maintenance and modifications of the code.]] +* +* .. minimal workspace length for CGEQP3 of an M x N matrix, +* CGEQRF of an N x N matrix, CGELQF of an N x N matrix, +* CUNMLQ for computing N x N matrix, CUNMQR for computing N x N +* matrix, CUNMQR for computing M x N matrix, respectively. + LWQP3 = N+1 + LWQRF = MAX( 1, N ) + LWLQF = MAX( 1, N ) + LWUNMLQ = MAX( 1, N ) + LWUNMQR = MAX( 1, N ) + LWUNMQRM = MAX( 1, M ) +* .. minimal workspace length for CPOCON of an N x N matrix + LWCON = 2 * N +* .. minimal workspace length for CGESVJ of an N x N matrix, +* without and with explicit accumulation of Jacobi rotations + LWSVDJ = MAX( 2 * N, 1 ) + LWSVDJV = MAX( 2 * N, 1 ) +* .. minimal REAL workspace length for CGEQP3, CPOCON, CGESVJ + LRWQP3 = 2 * N + LRWCON = N + LRWSVDJ = N + IF ( LQUERY ) THEN + CALL CGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1, + $ RDUMMY, IERR ) + LWRK_CGEQP3 = INT( CDUMMY(1) ) + CALL CGEQRF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) + LWRK_CGEQRF = INT( CDUMMY(1) ) + CALL CGELQF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) + LWRK_CGELQF = INT( CDUMMY(1) ) + END IF + MINWRK = 2 + OPTWRK = 2 + MINIWRK = N + IF ( .NOT. (LSVEC .OR. RSVEC ) ) THEN +* .. minimal and optimal sizes of the complex workspace if +* only the singular values are requested + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, N**2+LWCON, N+LWQRF, LWSVDJ ) + ELSE + MINWRK = MAX( N+LWQP3, N+LWQRF, LWSVDJ ) + END IF + IF ( LQUERY ) THEN + CALL CGESVJ( 'L', 'N', 'N', N, N, A, LDA, SVA, N, V, + $ LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_CGESVJ = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_CGEQP3, N**2+LWCON, + $ N+LWRK_CGEQRF, LWRK_CGESVJ ) + ELSE + OPTWRK = MAX( N+LWRK_CGEQP3, N+LWRK_CGEQRF, + $ LWRK_CGESVJ ) + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + IF ( ERREST ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWCON, LRWSVDJ ) + ELSE + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ ) + END IF + ELSE + IF ( ERREST ) THEN + MINRWRK = MAX( 7, LRWQP3, LRWCON, LRWSVDJ ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) + END IF + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE IF ( RSVEC .AND. (.NOT.LSVEC) ) THEN +* .. minimal and optimal sizes of the complex workspace if the +* singular values and the right singular vectors are requested + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, LWCON, LWSVDJ, N+LWLQF, + $ 2*N+LWQRF, N+LWSVDJ, N+LWUNMLQ ) + ELSE + MINWRK = MAX( N+LWQP3, LWSVDJ, N+LWLQF, 2*N+LWQRF, + $ N+LWSVDJ, N+LWUNMLQ ) + END IF + IF ( LQUERY ) THEN + CALL CGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, + $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_CGESVJ = INT( CDUMMY(1) ) + CALL CUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, + $ V, LDV, CDUMMY, -1, IERR ) + LWRK_CUNMLQ = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_CGEQP3, LWCON, LWRK_CGESVJ, + $ N+LWRK_CGELQF, 2*N+LWRK_CGEQRF, + $ N+LWRK_CGESVJ, N+LWRK_CUNMLQ ) + ELSE + OPTWRK = MAX( N+LWRK_CGEQP3, LWRK_CGESVJ,N+LWRK_CGELQF, + $ 2*N+LWRK_CGEQRF, N+LWRK_CGESVJ, + $ N+LWRK_CUNMLQ ) + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + IF ( ERREST ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ ) + END IF + ELSE + IF ( ERREST ) THEN + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) + END IF + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE IF ( LSVEC .AND. (.NOT.RSVEC) ) THEN +* .. minimal and optimal sizes of the complex workspace if the +* singular values and the left singular vectors are requested + IF ( ERREST ) THEN + MINWRK = N + MAX( LWQP3,LWCON,N+LWQRF,LWSVDJ,LWUNMQRM ) + ELSE + MINWRK = N + MAX( LWQP3, N+LWQRF, LWSVDJ, LWUNMQRM ) + END IF + IF ( LQUERY ) THEN + CALL CGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, + $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_CGESVJ = INT( CDUMMY(1) ) + CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_CUNMQRM = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = N + MAX( LWRK_CGEQP3, LWCON, N+LWRK_CGEQRF, + $ LWRK_CGESVJ, LWRK_CUNMQRM ) + ELSE + OPTWRK = N + MAX( LWRK_CGEQP3, N+LWRK_CGEQRF, + $ LWRK_CGESVJ, LWRK_CUNMQRM ) + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + IF ( ERREST ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ ) + END IF + ELSE + IF ( ERREST ) THEN + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) + END IF + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE +* .. minimal and optimal sizes of the complex workspace if the +* full SVD is requested + IF ( .NOT. JRACC ) THEN + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, N+LWCON, 2*N+N**2+LWCON, + $ 2*N+LWQRF, 2*N+LWQP3, + $ 2*N+N**2+N+LWLQF, 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWSVDJ, 2*N+N**2+N+LWSVDJV, + $ 2*N+N**2+N+LWUNMQR,2*N+N**2+N+LWUNMLQ, + $ N+N**2+LWSVDJ, N+LWUNMQRM ) + ELSE + MINWRK = MAX( N+LWQP3, 2*N+N**2+LWCON, + $ 2*N+LWQRF, 2*N+LWQP3, + $ 2*N+N**2+N+LWLQF, 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWSVDJ, 2*N+N**2+N+LWSVDJV, + $ 2*N+N**2+N+LWUNMQR,2*N+N**2+N+LWUNMLQ, + $ N+N**2+LWSVDJ, N+LWUNMQRM ) + END IF + MINIWRK = MINIWRK + N + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, N+LWCON, 2*N+LWQRF, + $ 2*N+N**2+LWSVDJV, 2*N+N**2+N+LWUNMQR, + $ N+LWUNMQRM ) + ELSE + MINWRK = MAX( N+LWQP3, 2*N+LWQRF, + $ 2*N+N**2+LWSVDJV, 2*N+N**2+N+LWUNMQR, + $ N+LWUNMQRM ) + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + END IF + IF ( LQUERY ) THEN + CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_CUNMQRM = INT( CDUMMY(1) ) + CALL CUNMQR( 'L', 'N', N, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_CUNMQR = INT( CDUMMY(1) ) + IF ( .NOT. JRACC ) THEN + CALL CGEQP3( N,N, A, LDA, IWORK, CDUMMY,CDUMMY, -1, + $ RDUMMY, IERR ) + LWRK_CGEQP3N = INT( CDUMMY(1) ) + CALL CGESVJ( 'L', 'U', 'N', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_CGESVJ = INT( CDUMMY(1) ) + CALL CGESVJ( 'U', 'U', 'N', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_CGESVJU = INT( CDUMMY(1) ) + CALL CGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_CGESVJV = INT( CDUMMY(1) ) + CALL CUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, + $ V, LDV, CDUMMY, -1, IERR ) + LWRK_CUNMLQ = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_CGEQP3, N+LWCON, + $ 2*N+N**2+LWCON, 2*N+LWRK_CGEQRF, + $ 2*N+LWRK_CGEQP3N, + $ 2*N+N**2+N+LWRK_CGELQF, + $ 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWRK_CGESVJ, + $ 2*N+N**2+N+LWRK_CGESVJV, + $ 2*N+N**2+N+LWRK_CUNMQR, + $ 2*N+N**2+N+LWRK_CUNMLQ, + $ N+N**2+LWRK_CGESVJU, + $ N+LWRK_CUNMQRM ) + ELSE + OPTWRK = MAX( N+LWRK_CGEQP3, + $ 2*N+N**2+LWCON, 2*N+LWRK_CGEQRF, + $ 2*N+LWRK_CGEQP3N, + $ 2*N+N**2+N+LWRK_CGELQF, + $ 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWRK_CGESVJ, + $ 2*N+N**2+N+LWRK_CGESVJV, + $ 2*N+N**2+N+LWRK_CUNMQR, + $ 2*N+N**2+N+LWRK_CUNMLQ, + $ N+N**2+LWRK_CGESVJU, + $ N+LWRK_CUNMQRM ) + END IF + ELSE + CALL CGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_CGESVJV = INT( CDUMMY(1) ) + CALL CUNMQR( 'L', 'N', N, N, N, CDUMMY, N, CDUMMY, + $ V, LDV, CDUMMY, -1, IERR ) + LWRK_CUNMQR = INT( CDUMMY(1) ) + CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_CUNMQRM = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_CGEQP3, N+LWCON, + $ 2*N+LWRK_CGEQRF, 2*N+N**2, + $ 2*N+N**2+LWRK_CGESVJV, + $ 2*N+N**2+N+LWRK_CUNMQR,N+LWRK_CUNMQRM ) + ELSE + OPTWRK = MAX( N+LWRK_CGEQP3, 2*N+LWRK_CGEQRF, + $ 2*N+N**2, 2*N+N**2+LWRK_CGESVJV, + $ 2*N+N**2+N+LWRK_CUNMQR, + $ N+LWRK_CUNMQRM ) + END IF + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON ) + END IF + END IF + MINWRK = MAX( 2, MINWRK ) + OPTWRK = MAX( OPTWRK, MINWRK ) + IF ( LWORK .LT. MINWRK .AND. (.NOT.LQUERY) ) INFO = - 17 + IF ( LRWORK .LT. MINRWRK .AND. (.NOT.LQUERY) ) INFO = - 19 + END IF +* + IF ( INFO .NE. 0 ) THEN +* #:( + CALL XERBLA( 'CGEJSV', - INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + CWORK(1) = OPTWRK + CWORK(2) = MINWRK + RWORK(1) = MINRWRK + IWORK(1) = MAX( 4, MINIWRK ) + RETURN + END IF +* +* Quick return for void matrix (Y3K safe) +* #:) + IF ( ( M .EQ. 0 ) .OR. ( N .EQ. 0 ) ) THEN + IWORK(1:4) = 0 + RWORK(1:7) = 0 + RETURN + ENDIF +* +* Determine whether the matrix U should be M x N or M x M +* + IF ( LSVEC ) THEN + N1 = N + IF ( LSAME( JOBU, 'F' ) ) N1 = M + END IF +* +* Set numerical parameters +* +*! NOTE: Make sure SLAMCH() does not fail on the target architecture. +* + EPSLN = SLAMCH('Epsilon') + SFMIN = SLAMCH('SafeMinimum') + SMALL = SFMIN / EPSLN + BIG = SLAMCH('O') +* BIG = ONE / SFMIN +* +* Initialize SVA(1:N) = diag( ||A e_i||_2 )_1^N +* +*(!) If necessary, scale SVA() to protect the largest norm from +* overflow. It is possible that this scaling pushes the smallest +* column norm left from the underflow threshold (extreme case). +* + SCALEM = ONE / SQRT(REAL(M)*REAL(N)) + NOSCAL = .TRUE. + GOSCAL = .TRUE. + DO 1874 p = 1, N + AAPP = ZERO + AAQQ = ONE + CALL CLASSQ( M, A(1,p), 1, AAPP, AAQQ ) + IF ( AAPP .GT. BIG ) THEN + INFO = - 9 + CALL XERBLA( 'CGEJSV', -INFO ) + RETURN + END IF + AAQQ = SQRT(AAQQ) + IF ( ( AAPP .LT. (BIG / AAQQ) ) .AND. NOSCAL ) THEN + SVA(p) = AAPP * AAQQ + ELSE + NOSCAL = .FALSE. + SVA(p) = AAPP * ( AAQQ * SCALEM ) + IF ( GOSCAL ) THEN + GOSCAL = .FALSE. + CALL SSCAL( p-1, SCALEM, SVA, 1 ) + END IF + END IF + 1874 CONTINUE +* + IF ( NOSCAL ) SCALEM = ONE +* + AAPP = ZERO + AAQQ = BIG + DO 4781 p = 1, N + AAPP = MAX( AAPP, SVA(p) ) + IF ( SVA(p) .NE. ZERO ) AAQQ = MIN( AAQQ, SVA(p) ) + 4781 CONTINUE +* +* Quick return for zero M x N matrix +* #:) + IF ( AAPP .EQ. ZERO ) THEN + IF ( LSVEC ) CALL CLASET( 'G', M, N1, CZERO, CONE, U, LDU ) + IF ( RSVEC ) CALL CLASET( 'G', N, N, CZERO, CONE, V, LDV ) + RWORK(1) = ONE + RWORK(2) = ONE + IF ( ERREST ) RWORK(3) = ONE + IF ( LSVEC .AND. RSVEC ) THEN + RWORK(4) = ONE + RWORK(5) = ONE + END IF + IF ( L2TRAN ) THEN + RWORK(6) = ZERO + RWORK(7) = ZERO + END IF + IWORK(1) = 0 + IWORK(2) = 0 + IWORK(3) = 0 + IWORK(4) = -1 + RETURN + END IF +* +* Issue warning if denormalized column norms detected. Override the +* high relative accuracy request. Issue licence to kill nonzero columns +* (set them to zero) whose norm is less than sigma_max / BIG (roughly). +* #:( + WARNING = 0 + IF ( AAQQ .LE. SFMIN ) THEN + L2RANK = .TRUE. + L2KILL = .TRUE. + WARNING = 1 + END IF +* +* Quick return for one-column matrix +* #:) + IF ( N .EQ. 1 ) THEN +* + IF ( LSVEC ) THEN + CALL CLASCL( 'G',0,0,SVA(1),SCALEM, M,1,A(1,1),LDA,IERR ) + CALL CLACPY( 'A', M, 1, A, LDA, U, LDU ) +* computing all M left singular vectors of the M x 1 matrix + IF ( N1 .NE. N ) THEN + CALL CGEQRF( M, N, U,LDU, CWORK, CWORK(N+1),LWORK-N,IERR ) + CALL CUNGQR( M,N1,1, U,LDU,CWORK,CWORK(N+1),LWORK-N,IERR ) + CALL CCOPY( M, A(1,1), 1, U(1,1), 1 ) + END IF + END IF + IF ( RSVEC ) THEN + V(1,1) = CONE + END IF + IF ( SVA(1) .LT. (BIG*SCALEM) ) THEN + SVA(1) = SVA(1) / SCALEM + SCALEM = ONE + END IF + RWORK(1) = ONE / SCALEM + RWORK(2) = ONE + IF ( SVA(1) .NE. ZERO ) THEN + IWORK(1) = 1 + IF ( ( SVA(1) / SCALEM) .GE. SFMIN ) THEN + IWORK(2) = 1 + ELSE + IWORK(2) = 0 + END IF + ELSE + IWORK(1) = 0 + IWORK(2) = 0 + END IF + IWORK(3) = 0 + IWORK(4) = -1 + IF ( ERREST ) RWORK(3) = ONE + IF ( LSVEC .AND. RSVEC ) THEN + RWORK(4) = ONE + RWORK(5) = ONE + END IF + IF ( L2TRAN ) THEN + RWORK(6) = ZERO + RWORK(7) = ZERO + END IF + RETURN +* + END IF +* + TRANSP = .FALSE. +* + AATMAX = -ONE + AATMIN = BIG + IF ( ROWPIV .OR. L2TRAN ) THEN +* +* Compute the row norms, needed to determine row pivoting sequence +* (in the case of heavily row weighted A, row pivoting is strongly +* advised) and to collect information needed to compare the +* structures of A * A^* and A^* * A (in the case L2TRAN.EQ..TRUE.). +* + IF ( L2TRAN ) THEN + DO 1950 p = 1, M + XSC = ZERO + TEMP1 = ONE + CALL CLASSQ( N, A(p,1), LDA, XSC, TEMP1 ) +* CLASSQ gets both the ell_2 and the ell_infinity norm +* in one pass through the vector + RWORK(M+p) = XSC * SCALEM + RWORK(p) = XSC * (SCALEM*SQRT(TEMP1)) + AATMAX = MAX( AATMAX, RWORK(p) ) + IF (RWORK(p) .NE. ZERO) + $ AATMIN = MIN(AATMIN,RWORK(p)) + 1950 CONTINUE + ELSE + DO 1904 p = 1, M + RWORK(M+p) = SCALEM*ABS( A(p,ICAMAX(N,A(p,1),LDA)) ) + AATMAX = MAX( AATMAX, RWORK(M+p) ) + AATMIN = MIN( AATMIN, RWORK(M+p) ) + 1904 CONTINUE + END IF +* + END IF +* +* For square matrix A try to determine whether A^* would be better +* input for the preconditioned Jacobi SVD, with faster convergence. +* The decision is based on an O(N) function of the vector of column +* and row norms of A, based on the Shannon entropy. This should give +* the right choice in most cases when the difference actually matters. +* It may fail and pick the slower converging side. +* + ENTRA = ZERO + ENTRAT = ZERO + IF ( L2TRAN ) THEN +* + XSC = ZERO + TEMP1 = ONE + CALL SLASSQ( N, SVA, 1, XSC, TEMP1 ) + TEMP1 = ONE / TEMP1 +* + ENTRA = ZERO + DO 1113 p = 1, N + BIG1 = ( ( SVA(p) / XSC )**2 ) * TEMP1 + IF ( BIG1 .NE. ZERO ) ENTRA = ENTRA + BIG1 * ALOG(BIG1) + 1113 CONTINUE + ENTRA = - ENTRA / ALOG(REAL(N)) +* +* Now, SVA().^2/Trace(A^* * A) is a point in the probability simplex. +* It is derived from the diagonal of A^* * A. Do the same with the +* diagonal of A * A^*, compute the entropy of the corresponding +* probability distribution. Note that A * A^* and A^* * A have the +* same trace. +* + ENTRAT = ZERO + DO 1114 p = 1, M + BIG1 = ( ( RWORK(p) / XSC )**2 ) * TEMP1 + IF ( BIG1 .NE. ZERO ) ENTRAT = ENTRAT + BIG1 * ALOG(BIG1) + 1114 CONTINUE + ENTRAT = - ENTRAT / ALOG(REAL(M)) +* +* Analyze the entropies and decide A or A^*. Smaller entropy +* usually means better input for the algorithm. +* + TRANSP = ( ENTRAT .LT. ENTRA ) +* +* If A^* is better than A, take the adjoint of A. This is allowed +* only for square matrices, M=N. + IF ( TRANSP ) THEN +* In an optimal implementation, this trivial transpose +* should be replaced with faster transpose. + DO 1115 p = 1, N - 1 + A(p,p) = CONJG(A(p,p)) + DO 1116 q = p + 1, N + CTEMP = CONJG(A(q,p)) + A(q,p) = CONJG(A(p,q)) + A(p,q) = CTEMP + 1116 CONTINUE + 1115 CONTINUE + A(N,N) = CONJG(A(N,N)) + DO 1117 p = 1, N + RWORK(M+p) = SVA(p) + SVA(p) = RWORK(p) +* previously computed row 2-norms are now column 2-norms +* of the transposed matrix + 1117 CONTINUE + TEMP1 = AAPP + AAPP = AATMAX + AATMAX = TEMP1 + TEMP1 = AAQQ + AAQQ = AATMIN + AATMIN = TEMP1 + KILL = LSVEC + LSVEC = RSVEC + RSVEC = KILL + IF ( LSVEC ) N1 = N +* + ROWPIV = .TRUE. + END IF +* + END IF +* END IF L2TRAN +* +* Scale the matrix so that its maximal singular value remains less +* than SQRT(BIG) -- the matrix is scaled so that its maximal column +* has Euclidean norm equal to SQRT(BIG/N). The only reason to keep +* SQRT(BIG) instead of BIG is the fact that CGEJSV uses LAPACK and +* BLAS routines that, in some implementations, are not capable of +* working in the full interval [SFMIN,BIG] and that they may provoke +* overflows in the intermediate results. If the singular values spread +* from SFMIN to BIG, then CGESVJ will compute them. So, in that case, +* one should use CGESVJ instead of CGEJSV. + BIG1 = SQRT( BIG ) + TEMP1 = SQRT( BIG / REAL(N) ) +* >> for future updates: allow bigger range, i.e. the largest column +* will be allowed up to BIG/N and CGESVJ will do the rest. However, for +* this all other (LAPACK) components must allow such a range. +* TEMP1 = BIG/REAL(N) +* TEMP1 = BIG * EPSLN this should 'almost' work with current LAPACK components + CALL SLASCL( 'G', 0, 0, AAPP, TEMP1, N, 1, SVA, N, IERR ) + IF ( AAQQ .GT. (AAPP * SFMIN) ) THEN + AAQQ = ( AAQQ / AAPP ) * TEMP1 + ELSE + AAQQ = ( AAQQ * TEMP1 ) / AAPP + END IF + TEMP1 = TEMP1 * SCALEM + CALL CLASCL( 'G', 0, 0, AAPP, TEMP1, M, N, A, LDA, IERR ) +* +* To undo scaling at the end of this procedure, multiply the +* computed singular values with USCAL2 / USCAL1. +* + USCAL1 = TEMP1 + USCAL2 = AAPP +* + IF ( L2KILL ) THEN +* L2KILL enforces computation of nonzero singular values in +* the restricted range of condition number of the initial A, +* sigma_max(A) / sigma_min(A) approx. SQRT(BIG)/SQRT(SFMIN). + XSC = SQRT( SFMIN ) + ELSE + XSC = SMALL +* +* Now, if the condition number of A is too big, +* sigma_max(A) / sigma_min(A) .GT. SQRT(BIG/N) * EPSLN / SFMIN, +* as a precaution measure, the full SVD is computed using CGESVJ +* with accumulated Jacobi rotations. This provides numerically +* more robust computation, at the cost of slightly increased run +* time. Depending on the concrete implementation of BLAS and LAPACK +* (i.e. how they behave in presence of extreme ill-conditioning) the +* implementor may decide to remove this switch. + IF ( ( AAQQ.LT.SQRT(SFMIN) ) .AND. LSVEC .AND. RSVEC ) THEN + JRACC = .TRUE. + END IF +* + END IF + IF ( AAQQ .LT. XSC ) THEN + DO 700 p = 1, N + IF ( SVA(p) .LT. XSC ) THEN + CALL CLASET( 'A', M, 1, CZERO, CZERO, A(1,p), LDA ) + SVA(p) = ZERO + END IF + 700 CONTINUE + END IF +* +* Preconditioning using QR factorization with pivoting +* + IF ( ROWPIV ) THEN +* Optional row permutation (Bjoerck row pivoting): +* A result by Cox and Higham shows that the Bjoerck's +* row pivoting combined with standard column pivoting +* has similar effect as Powell-Reid complete pivoting. +* The ell-infinity norms of A are made nonincreasing. + IF ( ( LSVEC .AND. RSVEC ) .AND. .NOT.( JRACC ) ) THEN + IWOFF = 2*N + ELSE + IWOFF = N + END IF + DO 1952 p = 1, M - 1 + q = ISAMAX( M-p+1, RWORK(M+p), 1 ) + p - 1 + IWORK(IWOFF+p) = q + IF ( p .NE. q ) THEN + TEMP1 = RWORK(M+p) + RWORK(M+p) = RWORK(M+q) + RWORK(M+q) = TEMP1 + END IF + 1952 CONTINUE + CALL CLASWP( N, A, LDA, 1, M-1, IWORK(IWOFF+1), 1 ) + END IF +* +* End of the preparation phase (scaling, optional sorting and +* transposing, optional flushing of small columns). +* +* Preconditioning +* +* If the full SVD is needed, the right singular vectors are computed +* from a matrix equation, and for that we need theoretical analysis +* of the Businger-Golub pivoting. So we use CGEQP3 as the first RR QRF. +* In all other cases the first RR QRF can be chosen by other criteria +* (eg speed by replacing global with restricted window pivoting, such +* as in xGEQPX from TOMS # 782). Good results will be obtained using +* xGEQPX with properly (!) chosen numerical parameters. +* Any improvement of CGEQP3 improves overall performance of CGEJSV. +* +* A * P1 = Q1 * [ R1^* 0]^*: + DO 1963 p = 1, N +* .. all columns are free columns + IWORK(p) = 0 + 1963 CONTINUE + CALL CGEQP3( M, N, A, LDA, IWORK, CWORK, CWORK(N+1), LWORK-N, + $ RWORK, IERR ) +* +* The upper triangular matrix R1 from the first QRF is inspected for +* rank deficiency and possibilities for deflation, or possible +* ill-conditioning. Depending on the user specified flag L2RANK, +* the procedure explores possibilities to reduce the numerical +* rank by inspecting the computed upper triangular factor. If +* L2RANK or L2ABER are up, then CGEJSV will compute the SVD of +* A + dA, where ||dA|| <= f(M,N)*EPSLN. +* + NR = 1 + IF ( L2ABER ) THEN +* Standard absolute error bound suffices. All sigma_i with +* sigma_i < N*EPSLN*||A|| are flushed to zero. This is an +* aggressive enforcement of lower numerical rank by introducing a +* backward error of the order of N*EPSLN*||A||. + TEMP1 = SQRT(REAL(N))*EPSLN + DO 3001 p = 2, N + IF ( ABS(A(p,p)) .GE. (TEMP1*ABS(A(1,1))) ) THEN + NR = NR + 1 + ELSE + GO TO 3002 + END IF + 3001 CONTINUE + 3002 CONTINUE + ELSE IF ( L2RANK ) THEN +* .. similarly as above, only slightly more gentle (less aggressive). +* Sudden drop on the diagonal of R1 is used as the criterion for +* close-to-rank-deficient. + TEMP1 = SQRT(SFMIN) + DO 3401 p = 2, N + IF ( ( ABS(A(p,p)) .LT. (EPSLN*ABS(A(p-1,p-1))) ) .OR. + $ ( ABS(A(p,p)) .LT. SMALL ) .OR. + $ ( L2KILL .AND. (ABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3402 + NR = NR + 1 + 3401 CONTINUE + 3402 CONTINUE +* + ELSE +* The goal is high relative accuracy. However, if the matrix +* has high scaled condition number the relative accuracy is in +* general not feasible. Later on, a condition number estimator +* will be deployed to estimate the scaled condition number. +* Here we just remove the underflowed part of the triangular +* factor. This prevents the situation in which the code is +* working hard to get the accuracy not warranted by the data. + TEMP1 = SQRT(SFMIN) + DO 3301 p = 2, N + IF ( ( ABS(A(p,p)) .LT. SMALL ) .OR. + $ ( L2KILL .AND. (ABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3302 + NR = NR + 1 + 3301 CONTINUE + 3302 CONTINUE +* + END IF +* + ALMORT = .FALSE. + IF ( NR .EQ. N ) THEN + MAXPRJ = ONE + DO 3051 p = 2, N + TEMP1 = ABS(A(p,p)) / SVA(IWORK(p)) + MAXPRJ = MIN( MAXPRJ, TEMP1 ) + 3051 CONTINUE + IF ( MAXPRJ**2 .GE. ONE - REAL(N)*EPSLN ) ALMORT = .TRUE. + END IF +* +* + SCONDA = - ONE + CONDR1 = - ONE + CONDR2 = - ONE +* + IF ( ERREST ) THEN + IF ( N .EQ. NR ) THEN + IF ( RSVEC ) THEN +* .. V is available as workspace + CALL CLACPY( 'U', N, N, A, LDA, V, LDV ) + DO 3053 p = 1, N + TEMP1 = SVA(IWORK(p)) + CALL CSSCAL( p, ONE/TEMP1, V(1,p), 1 ) + 3053 CONTINUE + IF ( LSVEC )THEN + CALL CPOCON( 'U', N, V, LDV, ONE, TEMP1, + $ CWORK(N+1), RWORK, IERR ) + ELSE + CALL CPOCON( 'U', N, V, LDV, ONE, TEMP1, + $ CWORK, RWORK, IERR ) + END IF +* + ELSE IF ( LSVEC ) THEN +* .. U is available as workspace + CALL CLACPY( 'U', N, N, A, LDA, U, LDU ) + DO 3054 p = 1, N + TEMP1 = SVA(IWORK(p)) + CALL CSSCAL( p, ONE/TEMP1, U(1,p), 1 ) + 3054 CONTINUE + CALL CPOCON( 'U', N, U, LDU, ONE, TEMP1, + $ CWORK(N+1), RWORK, IERR ) + ELSE + CALL CLACPY( 'U', N, N, A, LDA, CWORK, N ) +*[] CALL CLACPY( 'U', N, N, A, LDA, CWORK(N+1), N ) +* Change: here index shifted by N to the left, CWORK(1:N) +* not needed for SIGMA only computation + DO 3052 p = 1, N + TEMP1 = SVA(IWORK(p)) +*[] CALL CSSCAL( p, ONE/TEMP1, CWORK(N+(p-1)*N+1), 1 ) + CALL CSSCAL( p, ONE/TEMP1, CWORK((p-1)*N+1), 1 ) + 3052 CONTINUE +* .. the columns of R are scaled to have unit Euclidean lengths. +*[] CALL CPOCON( 'U', N, CWORK(N+1), N, ONE, TEMP1, +*[] $ CWORK(N+N*N+1), RWORK, IERR ) + CALL CPOCON( 'U', N, CWORK, N, ONE, TEMP1, + $ CWORK(N*N+1), RWORK, IERR ) +* + END IF + IF ( TEMP1 .NE. ZERO ) THEN + SCONDA = ONE / SQRT(TEMP1) + ELSE + SCONDA = - ONE + END IF +* SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1). +* N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA + ELSE + SCONDA = - ONE + END IF + END IF +* + L2PERT = L2PERT .AND. ( ABS( A(1,1)/A(NR,NR) ) .GT. SQRT(BIG1) ) +* If there is no violent scaling, artificial perturbation is not needed. +* +* Phase 3: +* + IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN +* +* Singular Values only +* +* .. transpose A(1:NR,1:N) + DO 1946 p = 1, MIN( N-1, NR ) + CALL CCOPY( N-p, A(p,p+1), LDA, A(p+1,p), 1 ) + CALL CLACGV( N-p+1, A(p,p), 1 ) + 1946 CONTINUE + IF ( NR .EQ. N ) A(N,N) = CONJG(A(N,N)) +* +* The following two DO-loops introduce small relative perturbation +* into the strict upper triangle of the lower triangular matrix. +* Small entries below the main diagonal are also changed. +* This modification is useful if the computing environment does not +* provide/allow FLUSH TO ZERO underflow, for it prevents many +* annoying denormalized numbers in case of strongly scaled matrices. +* The perturbation is structured so that it does not introduce any +* new perturbation of the singular values, and it does not destroy +* the job done by the preconditioner. +* The licence for this perturbation is in the variable L2PERT, which +* should be .FALSE. if FLUSH TO ZERO underflow is active. +* + IF ( .NOT. ALMORT ) THEN +* + IF ( L2PERT ) THEN +* XSC = SQRT(SMALL) + XSC = EPSLN / REAL(N) + DO 4947 q = 1, NR + CTEMP = CMPLX(XSC*ABS(A(q,q)),ZERO) + DO 4949 p = 1, N + IF ( ( (p.GT.q) .AND. (ABS(A(p,q)).LE.TEMP1) ) + $ .OR. ( p .LT. q ) ) +* $ A(p,q) = TEMP1 * ( A(p,q) / ABS(A(p,q)) ) + $ A(p,q) = CTEMP + 4949 CONTINUE + 4947 CONTINUE + ELSE + CALL CLASET( 'U', NR-1,NR-1, CZERO,CZERO, A(1,2),LDA ) + END IF +* +* .. second preconditioning using the QR factorization +* + CALL CGEQRF( N,NR, A,LDA, CWORK, CWORK(N+1),LWORK-N, IERR ) +* +* .. and transpose upper to lower triangular + DO 1948 p = 1, NR - 1 + CALL CCOPY( NR-p, A(p,p+1), LDA, A(p+1,p), 1 ) + CALL CLACGV( NR-p+1, A(p,p), 1 ) + 1948 CONTINUE +* + END IF +* +* Row-cyclic Jacobi SVD algorithm with column pivoting +* +* .. again some perturbation (a "background noise") is added +* to drown denormals + IF ( L2PERT ) THEN +* XSC = SQRT(SMALL) + XSC = EPSLN / REAL(N) + DO 1947 q = 1, NR + CTEMP = CMPLX(XSC*ABS(A(q,q)),ZERO) + DO 1949 p = 1, NR + IF ( ( (p.GT.q) .AND. (ABS(A(p,q)).LE.TEMP1) ) + $ .OR. ( p .LT. q ) ) +* $ A(p,q) = TEMP1 * ( A(p,q) / ABS(A(p,q)) ) + $ A(p,q) = CTEMP + 1949 CONTINUE + 1947 CONTINUE + ELSE + CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, A(1,2), LDA ) + END IF +* +* .. and one-sided Jacobi rotations are started on a lower +* triangular matrix (plus perturbation which is ignored in +* the part which destroys triangular form (confusing?!)) +* + CALL CGESVJ( 'L', 'N', 'N', NR, NR, A, LDA, SVA, + $ N, V, LDV, CWORK, LWORK, RWORK, LRWORK, INFO ) +* + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) +* +* + ELSE IF ( ( RSVEC .AND. ( .NOT. LSVEC ) .AND. ( .NOT. JRACC ) ) + $ .OR. + $ ( JRACC .AND. ( .NOT. LSVEC ) .AND. ( NR .NE. N ) ) ) THEN +* +* -> Singular Values and Right Singular Vectors <- +* + IF ( ALMORT ) THEN +* +* .. in this case NR equals N + DO 1998 p = 1, NR + CALL CCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + CALL CLACGV( N-p+1, V(p,p), 1 ) + 1998 CONTINUE + CALL CLASET( 'U', NR-1,NR-1, CZERO, CZERO, V(1,2), LDV ) +* + CALL CGESVJ( 'L','U','N', N, NR, V, LDV, SVA, NR, A, LDA, + $ CWORK, LWORK, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + + ELSE +* +* .. two more QR factorizations ( one QRF is not enough, two require +* accumulated product of Jacobi rotations, three are perfect ) +* + CALL CLASET( 'L', NR-1,NR-1, CZERO, CZERO, A(2,1), LDA ) + CALL CGELQF( NR,N, A, LDA, CWORK, CWORK(N+1), LWORK-N, IERR) + CALL CLACPY( 'L', NR, NR, A, LDA, V, LDV ) + CALL CLASET( 'U', NR-1,NR-1, CZERO, CZERO, V(1,2), LDV ) + CALL CGEQRF( NR, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) + DO 8998 p = 1, NR + CALL CCOPY( NR-p+1, V(p,p), LDV, V(p,p), 1 ) + CALL CLACGV( NR-p+1, V(p,p), 1 ) + 8998 CONTINUE + CALL CLASET('U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV) +* + CALL CGESVJ( 'L', 'U','N', NR, NR, V,LDV, SVA, NR, U, + $ LDU, CWORK(N+1), LWORK-N, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + IF ( NR .LT. N ) THEN + CALL CLASET( 'A',N-NR, NR, CZERO,CZERO, V(NR+1,1), LDV ) + CALL CLASET( 'A',NR, N-NR, CZERO,CZERO, V(1,NR+1), LDV ) + CALL CLASET( 'A',N-NR,N-NR,CZERO,CONE, V(NR+1,NR+1),LDV ) + END IF +* + CALL CUNMLQ( 'L', 'C', N, N, NR, A, LDA, CWORK, + $ V, LDV, CWORK(N+1), LWORK-N, IERR ) +* + END IF +* .. permute the rows of V +* DO 8991 p = 1, N +* CALL CCOPY( N, V(p,1), LDV, A(IWORK(p),1), LDA ) +* 8991 CONTINUE +* CALL CLACPY( 'All', N, N, A, LDA, V, LDV ) + CALL CLAPMR( .FALSE., N, N, V, LDV, IWORK ) +* + IF ( TRANSP ) THEN + CALL CLACPY( 'A', N, N, V, LDV, U, LDU ) + END IF +* + ELSE IF ( JRACC .AND. (.NOT. LSVEC) .AND. ( NR.EQ. N ) ) THEN +* + CALL CLASET( 'L', N-1,N-1, CZERO, CZERO, A(2,1), LDA ) +* + CALL CGESVJ( 'U','N','V', N, N, A, LDA, SVA, N, V, LDV, + $ CWORK, LWORK, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + CALL CLAPMR( .FALSE., N, N, V, LDV, IWORK ) +* + ELSE IF ( LSVEC .AND. ( .NOT. RSVEC ) ) THEN +* +* .. Singular Values and Left Singular Vectors .. +* +* .. second preconditioning step to avoid need to accumulate +* Jacobi rotations in the Jacobi iterations. + DO 1965 p = 1, NR + CALL CCOPY( N-p+1, A(p,p), LDA, U(p,p), 1 ) + CALL CLACGV( N-p+1, U(p,p), 1 ) + 1965 CONTINUE + CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU ) +* + CALL CGEQRF( N, NR, U, LDU, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) +* + DO 1967 p = 1, NR - 1 + CALL CCOPY( NR-p, U(p,p+1), LDU, U(p+1,p), 1 ) + CALL CLACGV( N-p+1, U(p,p), 1 ) + 1967 CONTINUE + CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU ) +* + CALL CGESVJ( 'L', 'U', 'N', NR,NR, U, LDU, SVA, NR, A, + $ LDA, CWORK(N+1), LWORK-N, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) +* + IF ( NR .LT. M ) THEN + CALL CLASET( 'A', M-NR, NR,CZERO, CZERO, U(NR+1,1), LDU ) + IF ( NR .LT. N1 ) THEN + CALL CLASET( 'A',NR, N1-NR, CZERO, CZERO, U(1,NR+1),LDU ) + CALL CLASET( 'A',M-NR,N1-NR,CZERO,CONE,U(NR+1,NR+1),LDU ) + END IF + END IF +* + CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) +* + IF ( ROWPIV ) + $ CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* + DO 1974 p = 1, N1 + XSC = ONE / SCNRM2( M, U(1,p), 1 ) + CALL CSSCAL( M, XSC, U(1,p), 1 ) + 1974 CONTINUE +* + IF ( TRANSP ) THEN + CALL CLACPY( 'A', N, N, U, LDU, V, LDV ) + END IF +* + ELSE +* +* .. Full SVD .. +* + IF ( .NOT. JRACC ) THEN +* + IF ( .NOT. ALMORT ) THEN +* +* Second Preconditioning Step (QRF [with pivoting]) +* Note that the composition of TRANSPOSE, QRF and TRANSPOSE is +* equivalent to an LQF CALL. Since in many libraries the QRF +* seems to be better optimized than the LQF, we do explicit +* transpose and use the QRF. This is subject to changes in an +* optimized implementation of CGEJSV. +* + DO 1968 p = 1, NR + CALL CCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + CALL CLACGV( N-p+1, V(p,p), 1 ) + 1968 CONTINUE +* +* .. the following two loops perturb small entries to avoid +* denormals in the second QR factorization, where they are +* as good as zeros. This is done to avoid painfully slow +* computation with denormals. The relative size of the perturbation +* is a parameter that can be changed by the implementer. +* This perturbation device will be obsolete on machines with +* properly implemented arithmetic. +* To switch it off, set L2PERT=.FALSE. To remove it from the +* code, remove the action under L2PERT=.TRUE., leave the ELSE part. +* The following two loops should be blocked and fused with the +* transposed copy above. +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 2969 q = 1, NR + CTEMP = CMPLX(XSC*ABS( V(q,q) ),ZERO) + DO 2968 p = 1, N + IF ( ( p .GT. q ) .AND. ( ABS(V(p,q)) .LE. TEMP1 ) + $ .OR. ( p .LT. q ) ) +* $ V(p,q) = TEMP1 * ( V(p,q) / ABS(V(p,q)) ) + $ V(p,q) = CTEMP + IF ( p .LT. q ) V(p,q) = - V(p,q) + 2968 CONTINUE + 2969 CONTINUE + ELSE + CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV ) + END IF +* +* Estimate the row scaled condition number of R1 +* (If R1 is rectangular, N > NR, then the condition number +* of the leading NR x NR submatrix is estimated.) +* + CALL CLACPY( 'L', NR, NR, V, LDV, CWORK(2*N+1), NR ) + DO 3950 p = 1, NR + TEMP1 = SCNRM2(NR-p+1,CWORK(2*N+(p-1)*NR+p),1) + CALL CSSCAL(NR-p+1,ONE/TEMP1,CWORK(2*N+(p-1)*NR+p),1) + 3950 CONTINUE + CALL CPOCON('L',NR,CWORK(2*N+1),NR,ONE,TEMP1, + $ CWORK(2*N+NR*NR+1),RWORK,IERR) + CONDR1 = ONE / SQRT(TEMP1) +* .. here need a second opinion on the condition number +* .. then assume worst case scenario +* R1 is OK for inverse <=> CONDR1 .LT. REAL(N) +* more conservative <=> CONDR1 .LT. SQRT(REAL(N)) +* + COND_OK = SQRT(SQRT(REAL(NR))) +*[TP] COND_OK is a tuning parameter. +* + IF ( CONDR1 .LT. COND_OK ) THEN +* .. the second QRF without pivoting. Note: in an optimized +* implementation, this QRF should be implemented as the QRF +* of a lower triangular matrix. +* R1^* = Q2 * R2 + CALL CGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL)/EPSLN + DO 3959 p = 2, NR + DO 3958 q = 1, p - 1 + CTEMP=CMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))), + $ ZERO) + IF ( ABS(V(q,p)) .LE. TEMP1 ) +* $ V(q,p) = TEMP1 * ( V(q,p) / ABS(V(q,p)) ) + $ V(q,p) = CTEMP + 3958 CONTINUE + 3959 CONTINUE + END IF +* + IF ( NR .NE. N ) + $ CALL CLACPY( 'A', N, NR, V, LDV, CWORK(2*N+1), N ) +* .. save ... +* +* .. this transposed copy should be better than naive + DO 1969 p = 1, NR - 1 + CALL CCOPY( NR-p, V(p,p+1), LDV, V(p+1,p), 1 ) + CALL CLACGV(NR-p+1, V(p,p), 1 ) + 1969 CONTINUE + V(NR,NR)=CONJG(V(NR,NR)) +* + CONDR2 = CONDR1 +* + ELSE +* +* .. ill-conditioned case: second QRF with pivoting +* Note that windowed pivoting would be equally good +* numerically, and more run-time efficient. So, in +* an optimal implementation, the next call to CGEQP3 +* should be replaced with eg. CALL CGEQPX (ACM TOMS #782) +* with properly (carefully) chosen parameters. +* +* R1^* * P2 = Q2 * R2 + DO 3003 p = 1, NR + IWORK(N+p) = 0 + 3003 CONTINUE + CALL CGEQP3( N, NR, V, LDV, IWORK(N+1), CWORK(N+1), + $ CWORK(2*N+1), LWORK-2*N, RWORK, IERR ) +** CALL CGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), +** $ LWORK-2*N, IERR ) + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 3969 p = 2, NR + DO 3968 q = 1, p - 1 + CTEMP=CMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))), + $ ZERO) + IF ( ABS(V(q,p)) .LE. TEMP1 ) +* $ V(q,p) = TEMP1 * ( V(q,p) / ABS(V(q,p)) ) + $ V(q,p) = CTEMP + 3968 CONTINUE + 3969 CONTINUE + END IF +* + CALL CLACPY( 'A', N, NR, V, LDV, CWORK(2*N+1), N ) +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 8970 p = 2, NR + DO 8971 q = 1, p - 1 + CTEMP=CMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))), + $ ZERO) +* V(p,q) = - TEMP1*( V(q,p) / ABS(V(q,p)) ) + V(p,q) = - CTEMP + 8971 CONTINUE + 8970 CONTINUE + ELSE + CALL CLASET( 'L',NR-1,NR-1,CZERO,CZERO,V(2,1),LDV ) + END IF +* Now, compute R2 = L3 * Q3, the LQ factorization. + CALL CGELQF( NR, NR, V, LDV, CWORK(2*N+N*NR+1), + $ CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, IERR ) +* .. and estimate the condition number + CALL CLACPY( 'L',NR,NR,V,LDV,CWORK(2*N+N*NR+NR+1),NR ) + DO 4950 p = 1, NR + TEMP1 = SCNRM2( p, CWORK(2*N+N*NR+NR+p), NR ) + CALL CSSCAL( p, ONE/TEMP1, CWORK(2*N+N*NR+NR+p), NR ) + 4950 CONTINUE + CALL CPOCON( 'L',NR,CWORK(2*N+N*NR+NR+1),NR,ONE,TEMP1, + $ CWORK(2*N+N*NR+NR+NR*NR+1),RWORK,IERR ) + CONDR2 = ONE / SQRT(TEMP1) +* +* + IF ( CONDR2 .GE. COND_OK ) THEN +* .. save the Householder vectors used for Q3 +* (this overwrites the copy of R2, as it will not be +* needed in this branch, but it does not overwritte the +* Huseholder vectors of Q2.). + CALL CLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N ) +* .. and the rest of the information on Q3 is in +* WORK(2*N+N*NR+1:2*N+N*NR+N) + END IF +* + END IF +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 4968 q = 2, NR + CTEMP = XSC * V(q,q) + DO 4969 p = 1, q - 1 +* V(p,q) = - TEMP1*( V(p,q) / ABS(V(p,q)) ) + V(p,q) = - CTEMP + 4969 CONTINUE + 4968 CONTINUE + ELSE + CALL CLASET( 'U', NR-1,NR-1, CZERO,CZERO, V(1,2), LDV ) + END IF +* +* Second preconditioning finished; continue with Jacobi SVD +* The input matrix is lower trinagular. +* +* Recover the right singular vectors as solution of a well +* conditioned triangular matrix equation. +* + IF ( CONDR1 .LT. COND_OK ) THEN +* + CALL CGESVJ( 'L','U','N',NR,NR,V,LDV,SVA,NR,U, LDU, + $ CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,RWORK, + $ LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + DO 3970 p = 1, NR + CALL CCOPY( NR, V(1,p), 1, U(1,p), 1 ) + CALL CSSCAL( NR, SVA(p), V(1,p), 1 ) + 3970 CONTINUE + +* .. pick the right matrix equation and solve it +* + IF ( NR .EQ. N ) THEN +* :)) .. best case, R1 is inverted. The solution of this matrix +* equation is Q2*V2 = the product of the Jacobi rotations +* used in CGESVJ, premultiplied with the orthogonal matrix +* from the second QR factorization. + CALL CTRSM('L','U','N','N', NR,NR,CONE, A,LDA, V,LDV) + ELSE +* .. R1 is well conditioned, but non-square. Adjoint of R2 +* is inverted to get the product of the Jacobi rotations +* used in CGESVJ. The Q-factor from the second QR +* factorization is then built in explicitly. + CALL CTRSM('L','U','C','N',NR,NR,CONE,CWORK(2*N+1), + $ N,V,LDV) + IF ( NR .LT. N ) THEN + CALL CLASET('A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV) + CALL CLASET('A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV) + CALL CLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV) + END IF + CALL CUNMQR('L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1), + $ V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR) + END IF +* + ELSE IF ( CONDR2 .LT. COND_OK ) THEN +* +* The matrix R2 is inverted. The solution of the matrix equation +* is Q3^* * V3 = the product of the Jacobi rotations (appplied to +* the lower triangular L3 from the LQ factorization of +* R2=L3*Q3), pre-multiplied with the transposed Q3. + CALL CGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U, + $ LDU, CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, + $ RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + DO 3870 p = 1, NR + CALL CCOPY( NR, V(1,p), 1, U(1,p), 1 ) + CALL CSSCAL( NR, SVA(p), U(1,p), 1 ) + 3870 CONTINUE + CALL CTRSM('L','U','N','N',NR,NR,CONE,CWORK(2*N+1),N, + $ U,LDU) +* .. apply the permutation from the second QR factorization + DO 873 q = 1, NR + DO 872 p = 1, NR + CWORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q) + 872 CONTINUE + DO 874 p = 1, NR + U(p,q) = CWORK(2*N+N*NR+NR+p) + 874 CONTINUE + 873 CONTINUE + IF ( NR .LT. N ) THEN + CALL CLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV ) + CALL CLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV ) + CALL CLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV) + END IF + CALL CUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1), + $ V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR ) + ELSE +* Last line of defense. +* #:( This is a rather pathological case: no scaled condition +* improvement after two pivoted QR factorizations. Other +* possibility is that the rank revealing QR factorization +* or the condition estimator has failed, or the COND_OK +* is set very close to ONE (which is unnecessary). Normally, +* this branch should never be executed, but in rare cases of +* failure of the RRQR or condition estimator, the last line of +* defense ensures that CGEJSV completes the task. +* Compute the full SVD of L3 using CGESVJ with explicit +* accumulation of Jacobi rotations. + CALL CGESVJ( 'L', 'U', 'V', NR, NR, V, LDV, SVA, NR, U, + $ LDU, CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, + $ RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + IF ( NR .LT. N ) THEN + CALL CLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV ) + CALL CLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV ) + CALL CLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV) + END IF + CALL CUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1), + $ V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR ) +* + CALL CUNMLQ( 'L', 'C', NR, NR, NR, CWORK(2*N+1), N, + $ CWORK(2*N+N*NR+1), U, LDU, CWORK(2*N+N*NR+NR+1), + $ LWORK-2*N-N*NR-NR, IERR ) + DO 773 q = 1, NR + DO 772 p = 1, NR + CWORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q) + 772 CONTINUE + DO 774 p = 1, NR + U(p,q) = CWORK(2*N+N*NR+NR+p) + 774 CONTINUE + 773 CONTINUE +* + END IF +* +* Permute the rows of V using the (column) permutation from the +* first QRF. Also, scale the columns to make them unit in +* Euclidean norm. This applies to all cases. +* + TEMP1 = SQRT(REAL(N)) * EPSLN + DO 1972 q = 1, N + DO 972 p = 1, N + CWORK(2*N+N*NR+NR+IWORK(p)) = V(p,q) + 972 CONTINUE + DO 973 p = 1, N + V(p,q) = CWORK(2*N+N*NR+NR+p) + 973 CONTINUE + XSC = ONE / SCNRM2( N, V(1,q), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL CSSCAL( N, XSC, V(1,q), 1 ) + 1972 CONTINUE +* At this moment, V contains the right singular vectors of A. +* Next, assemble the left singular vector matrix U (M x N). + IF ( NR .LT. M ) THEN + CALL CLASET('A', M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU) + IF ( NR .LT. N1 ) THEN + CALL CLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU) + CALL CLASET('A',M-NR,N1-NR,CZERO,CONE, + $ U(NR+1,NR+1),LDU) + END IF + END IF +* +* The Q matrix from the first QRF is built into the left singular +* matrix U. This applies to all cases. +* + CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) + +* The columns of U are normalized. The cost is O(M*N) flops. + TEMP1 = SQRT(REAL(M)) * EPSLN + DO 1973 p = 1, NR + XSC = ONE / SCNRM2( M, U(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL CSSCAL( M, XSC, U(1,p), 1 ) + 1973 CONTINUE +* +* If the initial QRF is computed with row pivoting, the left +* singular vectors must be adjusted. +* + IF ( ROWPIV ) + $ CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* + ELSE +* +* .. the initial matrix A has almost orthogonal columns and +* the second QRF is not needed +* + CALL CLACPY( 'U', N, N, A, LDA, CWORK(N+1), N ) + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 5970 p = 2, N + CTEMP = XSC * CWORK( N + (p-1)*N + p ) + DO 5971 q = 1, p - 1 +* CWORK(N+(q-1)*N+p)=-TEMP1 * ( CWORK(N+(p-1)*N+q) / +* $ ABS(CWORK(N+(p-1)*N+q)) ) + CWORK(N+(q-1)*N+p)=-CTEMP + 5971 CONTINUE + 5970 CONTINUE + ELSE + CALL CLASET( 'L',N-1,N-1,CZERO,CZERO,CWORK(N+2),N ) + END IF +* + CALL CGESVJ( 'U', 'U', 'N', N, N, CWORK(N+1), N, SVA, + $ N, U, LDU, CWORK(N+N*N+1), LWORK-N-N*N, RWORK, LRWORK, + $ INFO ) +* + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + DO 6970 p = 1, N + CALL CCOPY( N, CWORK(N+(p-1)*N+1), 1, U(1,p), 1 ) + CALL CSSCAL( N, SVA(p), CWORK(N+(p-1)*N+1), 1 ) + 6970 CONTINUE +* + CALL CTRSM( 'L', 'U', 'N', 'N', N, N, + $ CONE, A, LDA, CWORK(N+1), N ) + DO 6972 p = 1, N + CALL CCOPY( N, CWORK(N+p), N, V(IWORK(p),1), LDV ) + 6972 CONTINUE + TEMP1 = SQRT(REAL(N))*EPSLN + DO 6971 p = 1, N + XSC = ONE / SCNRM2( N, V(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL CSSCAL( N, XSC, V(1,p), 1 ) + 6971 CONTINUE +* +* Assemble the left singular vector matrix U (M x N). +* + IF ( N .LT. M ) THEN + CALL CLASET( 'A', M-N, N, CZERO, CZERO, U(N+1,1), LDU ) + IF ( N .LT. N1 ) THEN + CALL CLASET('A',N, N1-N, CZERO, CZERO, U(1,N+1),LDU) + CALL CLASET( 'A',M-N,N1-N, CZERO, CONE,U(N+1,N+1),LDU) + END IF + END IF + CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) + TEMP1 = SQRT(REAL(M))*EPSLN + DO 6973 p = 1, N1 + XSC = ONE / SCNRM2( M, U(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL CSSCAL( M, XSC, U(1,p), 1 ) + 6973 CONTINUE +* + IF ( ROWPIV ) + $ CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* + END IF +* +* end of the >> almost orthogonal case << in the full SVD +* + ELSE +* +* This branch deploys a preconditioned Jacobi SVD with explicitly +* accumulated rotations. It is included as optional, mainly for +* experimental purposes. It does perform well, and can also be used. +* In this implementation, this branch will be automatically activated +* if the condition number sigma_max(A) / sigma_min(A) is predicted +* to be greater than the overflow threshold. This is because the +* a posteriori computation of the singular vectors assumes robust +* implementation of BLAS and some LAPACK procedures, capable of working +* in presence of extreme values, e.g. when the singular values spread from +* the underflow to the overflow threshold. +* + DO 7968 p = 1, NR + CALL CCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + CALL CLACGV( N-p+1, V(p,p), 1 ) + 7968 CONTINUE +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL/EPSLN) + DO 5969 q = 1, NR + CTEMP = CMPLX(XSC*ABS( V(q,q) ),ZERO) + DO 5968 p = 1, N + IF ( ( p .GT. q ) .AND. ( ABS(V(p,q)) .LE. TEMP1 ) + $ .OR. ( p .LT. q ) ) +* $ V(p,q) = TEMP1 * ( V(p,q) / ABS(V(p,q)) ) + $ V(p,q) = CTEMP + IF ( p .LT. q ) V(p,q) = - V(p,q) + 5968 CONTINUE + 5969 CONTINUE + ELSE + CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV ) + END IF + + CALL CGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) + CALL CLACPY( 'L', N, NR, V, LDV, CWORK(2*N+1), N ) +* + DO 7969 p = 1, NR + CALL CCOPY( NR-p+1, V(p,p), LDV, U(p,p), 1 ) + CALL CLACGV( NR-p+1, U(p,p), 1 ) + 7969 CONTINUE + + IF ( L2PERT ) THEN + XSC = SQRT(SMALL/EPSLN) + DO 9970 q = 2, NR + DO 9971 p = 1, q - 1 + CTEMP = CMPLX(XSC * MIN(ABS(U(p,p)),ABS(U(q,q))), + $ ZERO) +* U(p,q) = - TEMP1 * ( U(q,p) / ABS(U(q,p)) ) + U(p,q) = - CTEMP + 9971 CONTINUE + 9970 CONTINUE + ELSE + CALL CLASET('U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU ) + END IF + + CALL CGESVJ( 'L', 'U', 'V', NR, NR, U, LDU, SVA, + $ N, V, LDV, CWORK(2*N+N*NR+1), LWORK-2*N-N*NR, + $ RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + + IF ( NR .LT. N ) THEN + CALL CLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV ) + CALL CLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV ) + CALL CLASET( 'A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV ) + END IF + + CALL CUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1), + $ V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR ) +* +* Permute the rows of V using the (column) permutation from the +* first QRF. Also, scale the columns to make them unit in +* Euclidean norm. This applies to all cases. +* + TEMP1 = SQRT(REAL(N)) * EPSLN + DO 7972 q = 1, N + DO 8972 p = 1, N + CWORK(2*N+N*NR+NR+IWORK(p)) = V(p,q) + 8972 CONTINUE + DO 8973 p = 1, N + V(p,q) = CWORK(2*N+N*NR+NR+p) + 8973 CONTINUE + XSC = ONE / SCNRM2( N, V(1,q), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL CSSCAL( N, XSC, V(1,q), 1 ) + 7972 CONTINUE +* +* At this moment, V contains the right singular vectors of A. +* Next, assemble the left singular vector matrix U (M x N). +* + IF ( NR .LT. M ) THEN + CALL CLASET( 'A', M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU ) + IF ( NR .LT. N1 ) THEN + CALL CLASET('A',NR, N1-NR, CZERO, CZERO, U(1,NR+1),LDU) + CALL CLASET('A',M-NR,N1-NR, CZERO, CONE,U(NR+1,NR+1),LDU) + END IF + END IF +* + CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) +* + IF ( ROWPIV ) + $ CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* +* + END IF + IF ( TRANSP ) THEN +* .. swap U and V because the procedure worked on A^* + DO 6974 p = 1, N + CALL CSWAP( N, U(1,p), 1, V(1,p), 1 ) + 6974 CONTINUE + END IF +* + END IF +* end of the full SVD +* +* Undo scaling, if necessary (and possible) +* + IF ( USCAL2 .LE. (BIG/SVA(1))*USCAL1 ) THEN + CALL SLASCL( 'G', 0, 0, USCAL1, USCAL2, NR, 1, SVA, N, IERR ) + USCAL1 = ONE + USCAL2 = ONE + END IF +* + IF ( NR .LT. N ) THEN + DO 3004 p = NR+1, N + SVA(p) = ZERO + 3004 CONTINUE + END IF +* + RWORK(1) = USCAL2 * SCALEM + RWORK(2) = USCAL1 + IF ( ERREST ) RWORK(3) = SCONDA + IF ( LSVEC .AND. RSVEC ) THEN + RWORK(4) = CONDR1 + RWORK(5) = CONDR2 + END IF + IF ( L2TRAN ) THEN + RWORK(6) = ENTRA + RWORK(7) = ENTRAT + END IF +* + IWORK(1) = NR + IWORK(2) = NUMRANK + IWORK(3) = WARNING + IF ( TRANSP ) THEN + IWORK(4) = 1 + ELSE + IWORK(4) = -1 + END IF + +* + RETURN +* .. +* .. END OF CGEJSV +* .. + END +* diff --git a/lapack-netlib/cgesvx.f b/lapack-netlib/cgesvx.f new file mode 100644 index 000000000..74a37e9a0 --- /dev/null +++ b/lapack-netlib/cgesvx.f @@ -0,0 +1,602 @@ +*> \brief CGESVX computes the solution to system of linear equations A * X = B for GE matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CGESVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, +* EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR, +* WORK, RWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, LDA, LDAF, LDB, LDX, N, NRHS +* REAL RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ) +* REAL BERR( * ), C( * ), FERR( * ), R( * ), +* $ RWORK( * ) +* COMPLEX A( LDA, * ), AF( LDAF, * ), B( LDB, * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CGESVX uses the LU factorization to compute the solution to a complex +*> system of linear equations +*> A * X = B, +*> where A is an N-by-N matrix and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = P * L * U, +*> where P is a permutation matrix, L is a unit lower triangular +*> matrix, and U is upper triangular. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AF and IPIV contain the factored form of A. +*> If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> A, AF, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AF and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AF and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations: +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> On entry, the N-by-N matrix A. If FACT = 'F' and EQUED is +*> not 'N', then A must have been equilibrated by the scaling +*> factors in R and/or C. A is not modified if FACT = 'F' or +*> 'N', or if FACT = 'E' and EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] AF +*> \verbatim +*> AF is COMPLEX array, dimension (LDAF,N) +*> If FACT = 'F', then AF is an input argument and on entry +*> contains the factors L and U from the factorization +*> A = P*L*U as computed by CGETRF. If EQUED .ne. 'N', then +*> AF is the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the equilibrated matrix A (see the description of A for +*> the form of the equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAF +*> \verbatim +*> LDAF is INTEGER +*> The leading dimension of the array AF. LDAF >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = P*L*U +*> as computed by CGETRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is REAL array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX array, dimension (LDB,NRHS) +*> On entry, the N-by-NRHS right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is COMPLEX array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). +*> \endverbatim +*> +*> \param[out] RCOND +*> \verbatim +*> RCOND is REAL +*> The estimate of the reciprocal condition number of the matrix +*> A after equilibration (if done). If RCOND is less than the +*> machine precision (in particular, if RCOND = 0), the matrix +*> is singular to working precision. This condition is +*> indicated by a return code of INFO > 0. +*> \endverbatim +*> +*> \param[out] FERR +*> \verbatim +*> FERR is REAL array, dimension (NRHS) +*> The estimated forward error bound for each solution vector +*> X(j) (the j-th column of the solution matrix X). +*> If XTRUE is the true solution corresponding to X(j), FERR(j) +*> is an estimated upper bound for the magnitude of the largest +*> element in (X(j) - XTRUE) divided by the magnitude of the +*> largest element in X(j). The estimate is as reliable as +*> the estimate for RCOND, and is almost always a slight +*> overestimate of the true error. +*> \endverbatim +*> +*> \param[out] BERR +*> \verbatim +*> BERR is REAL array, dimension (NRHS) +*> The componentwise relative backward error of each solution +*> vector X(j) (i.e., the smallest relative change in +*> any element of A or B that makes X(j) an exact solution). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (2*N) +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (MAX(1,2*N)) +*> On exit, RWORK(1) contains the reciprocal pivot growth +*> factor norm(A)/norm(U). The "max absolute element" norm is +*> used. If RWORK(1) is much less than 1, then the stability +*> of the LU factorization of the (equilibrated) matrix A +*> could be poor. This also means that the solution X, condition +*> estimator RCOND, and forward error bound FERR could be +*> unreliable. If factorization fails with 0 RWORK(1) contains the reciprocal pivot growth factor for the +*> leading INFO columns of A. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, and i is +*> <= N: U(i,i) is exactly zero. The factorization has +*> been completed, but the factor U is exactly +*> singular, so the solution and error bounds +*> could not be computed. RCOND = 0 is returned. +*> = N+1: U is nonsingular, but RCOND is less than machine +*> precision, meaning that the matrix is singular +*> to working precision. Nevertheless, the +*> solution and error bounds are computed because +*> there are a number of situations where the +*> computed solution can be more accurate than the +*> value of RCOND would suggest. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexGEsolve +* +* ===================================================================== + SUBROUTINE CGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, + $ EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR, + $ WORK, RWORK, INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER EQUED, FACT, TRANS + INTEGER INFO, LDA, LDAF, LDB, LDX, N, NRHS + REAL RCOND +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + REAL BERR( * ), C( * ), FERR( * ), R( * ), + $ RWORK( * ) + COMPLEX A( LDA, * ), AF( LDAF, * ), B( LDB, * ), + $ WORK( * ), X( LDX, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU + CHARACTER NORM + INTEGER I, INFEQU, J + REAL AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, + $ ROWCND, RPVGRW, SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + REAL CLANGE, CLANTR, SLAMCH + EXTERNAL LSAME, CLANGE, CLANTR, SLAMCH +* .. +* .. External Subroutines .. + EXTERNAL CGECON, CGEEQU, CGERFS, CGETRF, CGETRS, CLACPY, + $ CLAQGE, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + NOFACT = LSAME( FACT, 'N' ) + EQUIL = LSAME( FACT, 'E' ) + NOTRAN = LSAME( TRANS, 'N' ) + IF( NOFACT .OR. EQUIL ) THEN + EQUED = 'N' + ROWEQU = .FALSE. + COLEQU = .FALSE. + ELSE + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + SMLNUM = SLAMCH( 'Safe minimum' ) + BIGNUM = ONE / SMLNUM + END IF +* +* Test the input parameters. +* + IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) ) + $ THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE IF( LDAF.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -10 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -11 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -12 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -14 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -16 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGESVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL CGEEQU( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL CLAQGE( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, + $ EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. EQUIL ) THEN +* +* Compute the LU factorization of A. +* + CALL CLACPY( 'Full', N, N, A, LDA, AF, LDAF ) + CALL CGETRF( N, N, AF, LDAF, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + RPVGRW = CLANTR( 'M', 'U', 'N', INFO, INFO, AF, LDAF, + $ RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = CLANGE( 'M', N, INFO, A, LDA, RWORK ) / + $ RPVGRW + END IF + RWORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = CLANGE( NORM, N, N, A, LDA, RWORK ) + RPVGRW = CLANTR( 'M', 'U', 'N', N, N, AF, LDAF, RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = CLANGE( 'M', N, N, A, LDA, RWORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL CGECON( NORM, N, AF, LDAF, ANORM, RCOND, WORK, RWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL CLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL CGETRS( TRANS, N, NRHS, AF, LDAF, IPIV, X, LDX, INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL CGERFS( TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, B, LDB, X, + $ LDX, FERR, BERR, WORK, RWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. +* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 80 J = 1, NRHS + DO 70 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 70 CONTINUE + 80 CONTINUE + DO 90 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 90 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 120 CONTINUE + END IF +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 +* + RWORK( 1 ) = RPVGRW + RETURN +* +* End of CGESVX +* + END diff --git a/lapack-netlib/dgbsvx.f b/lapack-netlib/dgbsvx.f new file mode 100644 index 000000000..0ee5eecb3 --- /dev/null +++ b/lapack-netlib/dgbsvx.f @@ -0,0 +1,639 @@ +*> \brief DGBSVX computes the solution to system of linear equations A * X = B for GB matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DGBSVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, +* LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, +* RCOND, FERR, BERR, WORK, IWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS +* DOUBLE PRECISION RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ), IWORK( * ) +* DOUBLE PRECISION AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), +* $ BERR( * ), C( * ), FERR( * ), R( * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DGBSVX uses the LU factorization to compute the solution to a real +*> system of linear equations A * X = B, A**T * X = B, or A**H * X = B, +*> where A is a band matrix of order N with KL subdiagonals and KU +*> superdiagonals, and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed by this subroutine: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = L * U, +*> where L is a product of permutation and unit lower triangular +*> matrices with KL subdiagonals, and U is upper triangular with +*> KL+KU superdiagonals. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AFB and IPIV contain the factored form of +*> A. If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> AB, AFB, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AFB and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AFB and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations. +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KL +*> \verbatim +*> KL is INTEGER +*> The number of subdiagonals within the band of A. KL >= 0. +*> \endverbatim +*> +*> \param[in] KU +*> \verbatim +*> KU is INTEGER +*> The number of superdiagonals within the band of A. KU >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is DOUBLE PRECISION array, dimension (LDAB,N) +*> On entry, the matrix A in band storage, in rows 1 to KL+KU+1. +*> The j-th column of A is stored in the j-th column of the +*> array AB as follows: +*> AB(KU+1+i-j,j) = A(i,j) for max(1,j-KU)<=i<=min(N,j+kl) +*> +*> If FACT = 'F' and EQUED is not 'N', then A must have been +*> equilibrated by the scaling factors in R and/or C. AB is not +*> modified if FACT = 'F' or 'N', or if FACT = 'E' and +*> EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] AFB +*> \verbatim +*> AFB is DOUBLE PRECISION array, dimension (LDAFB,N) +*> If FACT = 'F', then AFB is an input argument and on entry +*> contains details of the LU factorization of the band matrix +*> A, as computed by DGBTRF. U is stored as an upper triangular +*> band matrix with KL+KU superdiagonals in rows 1 to KL+KU+1, +*> and the multipliers used during the factorization are stored +*> in rows KL+KU+2 to 2*KL+KU+1. If EQUED .ne. 'N', then AFB is +*> the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AFB is an output argument and on exit +*> returns details of the LU factorization of A. +*> +*> If FACT = 'E', then AFB is an output argument and on exit +*> returns details of the LU factorization of the equilibrated +*> matrix A (see the description of AB for the form of the +*> equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAFB +*> \verbatim +*> LDAFB is INTEGER +*> The leading dimension of the array AFB. LDAFB >= 2*KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = L*U +*> as computed by DGBTRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is DOUBLE PRECISION array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB,NRHS) +*> On entry, the right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is DOUBLE PRECISION array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). +*> \endverbatim +*> +*> \param[out] RCOND +*> \verbatim +*> RCOND is DOUBLE PRECISION +*> The estimate of the reciprocal condition number of the matrix +*> A after equilibration (if done). If RCOND is less than the +*> machine precision (in particular, if RCOND = 0), the matrix +*> is singular to working precision. This condition is +*> indicated by a return code of INFO > 0. +*> \endverbatim +*> +*> \param[out] FERR +*> \verbatim +*> FERR is DOUBLE PRECISION array, dimension (NRHS) +*> The estimated forward error bound for each solution vector +*> X(j) (the j-th column of the solution matrix X). +*> If XTRUE is the true solution corresponding to X(j), FERR(j) +*> is an estimated upper bound for the magnitude of the largest +*> element in (X(j) - XTRUE) divided by the magnitude of the +*> largest element in X(j). The estimate is as reliable as +*> the estimate for RCOND, and is almost always a slight +*> overestimate of the true error. +*> \endverbatim +*> +*> \param[out] BERR +*> \verbatim +*> BERR is DOUBLE PRECISION array, dimension (NRHS) +*> The componentwise relative backward error of each solution +*> vector X(j) (i.e., the smallest relative change in +*> any element of A or B that makes X(j) an exact solution). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,3*N)) +*> On exit, WORK(1) contains the reciprocal pivot growth +*> factor norm(A)/norm(U). The "max absolute element" norm is +*> used. If WORK(1) is much less than 1, then the stability +*> of the LU factorization of the (equilibrated) matrix A +*> could be poor. This also means that the solution X, condition +*> estimator RCOND, and forward error bound FERR could be +*> unreliable. If factorization fails with 0 WORK(1) contains the reciprocal pivot growth factor for the +*> leading INFO columns of A. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N) +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, and i is +*> <= N: U(i,i) is exactly zero. The factorization +*> has been completed, but the factor U is exactly +*> singular, so the solution and error bounds +*> could not be computed. RCOND = 0 is returned. +*> = N+1: U is nonsingular, but RCOND is less than machine +*> precision, meaning that the matrix is singular +*> to working precision. Nevertheless, the +*> solution and error bounds are computed because +*> there are a number of situations where the +*> computed solution can be more accurate than the +*> value of RCOND would suggest. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleGBsolve +* +* ===================================================================== + SUBROUTINE DGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, + $ LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, + $ RCOND, FERR, BERR, WORK, IWORK, INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER EQUED, FACT, TRANS + INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS + DOUBLE PRECISION RCOND +* .. +* .. Array Arguments .. + INTEGER IPIV( * ), IWORK( * ) + DOUBLE PRECISION AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), + $ BERR( * ), C( * ), FERR( * ), R( * ), + $ WORK( * ), X( LDX, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU + CHARACTER NORM + INTEGER I, INFEQU, J, J1, J2 + DOUBLE PRECISION AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, + $ ROWCND, RPVGRW, SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + DOUBLE PRECISION DLAMCH, DLANGB, DLANTB + EXTERNAL LSAME, DLAMCH, DLANGB, DLANTB +* .. +* .. External Subroutines .. + EXTERNAL DCOPY, DGBCON, DGBEQU, DGBRFS, DGBTRF, DGBTRS, + $ DLACPY, DLAQGB, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + NOFACT = LSAME( FACT, 'N' ) + EQUIL = LSAME( FACT, 'E' ) + NOTRAN = LSAME( TRANS, 'N' ) + IF( NOFACT .OR. EQUIL ) THEN + EQUED = 'N' + ROWEQU = .FALSE. + COLEQU = .FALSE. + ELSE + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + SMLNUM = DLAMCH( 'Safe minimum' ) + BIGNUM = ONE / SMLNUM + END IF +* +* Test the input parameters. +* + IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) ) + $ THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( KL.LT.0 ) THEN + INFO = -4 + ELSE IF( KU.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDAB.LT.KL+KU+1 ) THEN + INFO = -8 + ELSE IF( LDAFB.LT.2*KL+KU+1 ) THEN + INFO = -10 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -12 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -13 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -14 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -16 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -18 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGBSVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL DGBEQU( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL DLAQGB( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. EQUIL ) THEN +* +* Compute the LU factorization of the band matrix A. +* + DO 70 J = 1, N + J1 = MAX( J-KU, 1 ) + J2 = MIN( J+KL, N ) + CALL DCOPY( J2-J1+1, AB( KU+1-J+J1, J ), 1, + $ AFB( KL+KU+1-J+J1, J ), 1 ) + 70 CONTINUE +* + CALL DGBTRF( N, N, KL, KU, AFB, LDAFB, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + ANORM = ZERO + DO 90 J = 1, INFO + DO 80 I = MAX( KU+2-J, 1 ), MIN( N+KU+1-J, KL+KU+1 ) + ANORM = MAX( ANORM, ABS( AB( I, J ) ) ) + 80 CONTINUE + 90 CONTINUE + RPVGRW = DLANTB( 'M', 'U', 'N', INFO, MIN( INFO-1, KL+KU ), + $ AFB( MAX( 1, KL+KU+2-INFO ), 1 ), LDAFB, + $ WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = ANORM / RPVGRW + END IF + WORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = DLANGB( NORM, N, KL, KU, AB, LDAB, WORK ) + RPVGRW = DLANTB( 'M', 'U', 'N', N, KL+KU, AFB, LDAFB, WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = DLANGB( 'M', N, KL, KU, AB, LDAB, WORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL DGBCON( NORM, N, KL, KU, AFB, LDAFB, IPIV, ANORM, RCOND, + $ WORK, IWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL DLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL DGBTRS( TRANS, N, KL, KU, NRHS, AFB, LDAFB, IPIV, X, LDX, + $ INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL DGBRFS( TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, LDAFB, IPIV, + $ B, LDB, X, LDX, FERR, BERR, WORK, IWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. +* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 120 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 140 J = 1, NRHS + DO 130 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 130 CONTINUE + 140 CONTINUE + DO 150 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 150 CONTINUE + END IF +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.DLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 +* + WORK( 1 ) = RPVGRW + RETURN +* +* End of DGBSVX +* + END diff --git a/lapack-netlib/dgejsv.f b/lapack-netlib/dgejsv.f new file mode 100644 index 000000000..ee769bb38 --- /dev/null +++ b/lapack-netlib/dgejsv.f @@ -0,0 +1,1780 @@ +*> \brief \b DGEJSV +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DGEJSV + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, +* M, N, A, LDA, SVA, U, LDU, V, LDV, +* WORK, LWORK, IWORK, INFO ) +* +* .. Scalar Arguments .. +* IMPLICIT NONE +* INTEGER INFO, LDA, LDU, LDV, LWORK, M, N +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), SVA( N ), U( LDU, * ), V( LDV, * ), +* $ WORK( LWORK ) +* INTEGER IWORK( * ) +* CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DGEJSV computes the singular value decomposition (SVD) of a real M-by-N +*> matrix [A], where M >= N. The SVD of [A] is written as +*> +*> [A] = [U] * [SIGMA] * [V]^t, +*> +*> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N +*> diagonal elements, [U] is an M-by-N (or M-by-M) orthonormal matrix, and +*> [V] is an N-by-N orthogonal matrix. The diagonal elements of [SIGMA] are +*> the singular values of [A]. The columns of [U] and [V] are the left and +*> the right singular vectors of [A], respectively. The matrices [U] and [V] +*> are computed and stored in the arrays U and V, respectively. The diagonal +*> of [SIGMA] is computed and stored in the array SVA. +*> DGEJSV can sometimes compute tiny singular values and their singular vectors much +*> more accurately than other SVD routines, see below under Further Details. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBA +*> \verbatim +*> JOBA is CHARACTER*1 +*> Specifies the level of accuracy: +*> = 'C': This option works well (high relative accuracy) if A = B * D, +*> with well-conditioned B and arbitrary diagonal matrix D. +*> The accuracy cannot be spoiled by COLUMN scaling. The +*> accuracy of the computed output depends on the condition of +*> B, and the procedure aims at the best theoretical accuracy. +*> The relative error max_{i=1:N}|d sigma_i| / sigma_i is +*> bounded by f(M,N)*epsilon* cond(B), independent of D. +*> The input matrix is preprocessed with the QRF with column +*> pivoting. This initial preprocessing and preconditioning by +*> a rank revealing QR factorization is common for all values of +*> JOBA. Additional actions are specified as follows: +*> = 'E': Computation as with 'C' with an additional estimate of the +*> condition number of B. It provides a realistic error bound. +*> = 'F': If A = D1 * C * D2 with ill-conditioned diagonal scalings +*> D1, D2, and well-conditioned matrix C, this option gives +*> higher accuracy than the 'C' option. If the structure of the +*> input matrix is not known, and relative accuracy is +*> desirable, then this option is advisable. The input matrix A +*> is preprocessed with QR factorization with FULL (row and +*> column) pivoting. +*> = 'G': Computation as with 'F' with an additional estimate of the +*> condition number of B, where A=D*B. If A has heavily weighted +*> rows, then using this condition number gives too pessimistic +*> error bound. +*> = 'A': Small singular values are the noise and the matrix is treated +*> as numerically rank deficient. The error in the computed +*> singular values is bounded by f(m,n)*epsilon*||A||. +*> The computed SVD A = U * S * V^t restores A up to +*> f(m,n)*epsilon*||A||. +*> This gives the procedure the licence to discard (set to zero) +*> all singular values below N*epsilon*||A||. +*> = 'R': Similar as in 'A'. Rank revealing property of the initial +*> QR factorization is used do reveal (using triangular factor) +*> a gap sigma_{r+1} < epsilon * sigma_r in which case the +*> numerical RANK is declared to be r. The SVD is computed with +*> absolute error bounds, but more accurately than with 'A'. +*> \endverbatim +*> +*> \param[in] JOBU +*> \verbatim +*> JOBU is CHARACTER*1 +*> Specifies whether to compute the columns of U: +*> = 'U': N columns of U are returned in the array U. +*> = 'F': full set of M left sing. vectors is returned in the array U. +*> = 'W': U may be used as workspace of length M*N. See the description +*> of U. +*> = 'N': U is not computed. +*> \endverbatim +*> +*> \param[in] JOBV +*> \verbatim +*> JOBV is CHARACTER*1 +*> Specifies whether to compute the matrix V: +*> = 'V': N columns of V are returned in the array V; Jacobi rotations +*> are not explicitly accumulated. +*> = 'J': N columns of V are returned in the array V, but they are +*> computed as the product of Jacobi rotations. This option is +*> allowed only if JOBU .NE. 'N', i.e. in computing the full SVD. +*> = 'W': V may be used as workspace of length N*N. See the description +*> of V. +*> = 'N': V is not computed. +*> \endverbatim +*> +*> \param[in] JOBR +*> \verbatim +*> JOBR is CHARACTER*1 +*> Specifies the RANGE for the singular values. Issues the licence to +*> set to zero small positive singular values if they are outside +*> specified range. If A .NE. 0 is scaled so that the largest singular +*> value of c*A is around DSQRT(BIG), BIG=SLAMCH('O'), then JOBR issues +*> the licence to kill columns of A whose norm in c*A is less than +*> DSQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN, +*> where SFMIN=SLAMCH('S'), EPSLN=SLAMCH('E'). +*> = 'N': Do not kill small columns of c*A. This option assumes that +*> BLAS and QR factorizations and triangular solvers are +*> implemented to work in that range. If the condition of A +*> is greater than BIG, use DGESVJ. +*> = 'R': RESTRICTED range for sigma(c*A) is [DSQRT(SFMIN), DSQRT(BIG)] +*> (roughly, as described above). This option is recommended. +*> ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*> For computing the singular values in the FULL range [SFMIN,BIG] +*> use DGESVJ. +*> \endverbatim +*> +*> \param[in] JOBT +*> \verbatim +*> JOBT is CHARACTER*1 +*> If the matrix is square then the procedure may determine to use +*> transposed A if A^t seems to be better with respect to convergence. +*> If the matrix is not square, JOBT is ignored. This is subject to +*> changes in the future. +*> The decision is based on two values of entropy over the adjoint +*> orbit of A^t * A. See the descriptions of WORK(6) and WORK(7). +*> = 'T': transpose if entropy test indicates possibly faster +*> convergence of Jacobi process if A^t is taken as input. If A is +*> replaced with A^t, then the row pivoting is included automatically. +*> = 'N': do not speculate. +*> This option can be used to compute only the singular values, or the +*> full SVD (U, SIGMA and V). For only one set of singular vectors +*> (U or V), the caller should provide both U and V, as one of the +*> matrices is used as workspace if the matrix A is transposed. +*> The implementer can easily remove this constraint and make the +*> code more complicated. See the descriptions of U and V. +*> \endverbatim +*> +*> \param[in] JOBP +*> \verbatim +*> JOBP is CHARACTER*1 +*> Issues the licence to introduce structured perturbations to drown +*> denormalized numbers. This licence should be active if the +*> denormals are poorly implemented, causing slow computation, +*> especially in cases of fast convergence (!). For details see [1,2]. +*> For the sake of simplicity, this perturbations are included only +*> when the full SVD or only the singular values are requested. The +*> implementer/user can easily add the perturbation for the cases of +*> computing one set of singular vectors. +*> = 'P': introduce perturbation +*> = 'N': do not perturb +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the input matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the input matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] SVA +*> \verbatim +*> SVA is DOUBLE PRECISION array, dimension (N) +*> On exit, +*> - For WORK(1)/WORK(2) = ONE: The singular values of A. During the +*> computation SVA contains Euclidean column norms of the +*> iterated matrices in the array A. +*> - For WORK(1) .NE. WORK(2): The singular values of A are +*> (WORK(1)/WORK(2)) * SVA(1:N). This factored form is used if +*> sigma_max(A) overflows or if small singular values have been +*> saved from underflow by scaling the input matrix A. +*> - If JOBR='R' then some of the singular values may be returned +*> as exact zeros obtained by "set to zero" because they are +*> below the numerical rank threshold or are denormalized numbers. +*> \endverbatim +*> +*> \param[out] U +*> \verbatim +*> U is DOUBLE PRECISION array, dimension ( LDU, N ) or ( LDU, M ) +*> If JOBU = 'U', then U contains on exit the M-by-N matrix of +*> the left singular vectors. +*> If JOBU = 'F', then U contains on exit the M-by-M matrix of +*> the left singular vectors, including an ONB +*> of the orthogonal complement of the Range(A). +*> If JOBU = 'W' .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N), +*> then U is used as workspace if the procedure +*> replaces A with A^t. In that case, [V] is computed +*> in U as left singular vectors of A^t and then +*> copied back to the V array. This 'W' option is just +*> a reminder to the caller that in this case U is +*> reserved as workspace of length N*N. +*> If JOBU = 'N' U is not referenced, unless JOBT='T'. +*> \endverbatim +*> +*> \param[in] LDU +*> \verbatim +*> LDU is INTEGER +*> The leading dimension of the array U, LDU >= 1. +*> IF JOBU = 'U' or 'F' or 'W', then LDU >= M. +*> \endverbatim +*> +*> \param[out] V +*> \verbatim +*> V is DOUBLE PRECISION array, dimension ( LDV, N ) +*> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of +*> the right singular vectors; +*> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), +*> then V is used as workspace if the pprocedure +*> replaces A with A^t. In that case, [U] is computed +*> in V as right singular vectors of A^t and then +*> copied back to the U array. This 'W' option is just +*> a reminder to the caller that in this case V is +*> reserved as workspace of length N*N. +*> If JOBV = 'N' V is not referenced, unless JOBT='T'. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V, LDV >= 1. +*> If JOBV = 'V' or 'J' or 'W', then LDV >= N. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (MAX(7,LWORK)) +*> On exit, if N > 0 .AND. M > 0 (else not referenced), +*> WORK(1) = SCALE = WORK(2) / WORK(1) is the scaling factor such +*> that SCALE*SVA(1:N) are the computed singular values +*> of A. (See the description of SVA().) +*> WORK(2) = See the description of WORK(1). +*> WORK(3) = SCONDA is an estimate for the condition number of +*> column equilibrated A. (If JOBA = 'E' or 'G') +*> SCONDA is an estimate of DSQRT(||(R^t * R)^(-1)||_1). +*> It is computed using DPOCON. It holds +*> N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA +*> where R is the triangular factor from the QRF of A. +*> However, if R is truncated and the numerical rank is +*> determined to be strictly smaller than N, SCONDA is +*> returned as -1, thus indicating that the smallest +*> singular values might be lost. +*> +*> If full SVD is needed, the following two condition numbers are +*> useful for the analysis of the algorithm. They are provided for +*> a developer/implementer who is familiar with the details of +*> the method. +*> +*> WORK(4) = an estimate of the scaled condition number of the +*> triangular factor in the first QR factorization. +*> WORK(5) = an estimate of the scaled condition number of the +*> triangular factor in the second QR factorization. +*> The following two parameters are computed if JOBT = 'T'. +*> They are provided for a developer/implementer who is familiar +*> with the details of the method. +*> +*> WORK(6) = the entropy of A^t*A :: this is the Shannon entropy +*> of diag(A^t*A) / Trace(A^t*A) taken as point in the +*> probability simplex. +*> WORK(7) = the entropy of A*A^t. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> Length of WORK to confirm proper allocation of work space. +*> LWORK depends on the job: +*> +*> If only SIGMA is needed (JOBU = 'N', JOBV = 'N') and +*> -> .. no scaled condition estimate required (JOBE = 'N'): +*> LWORK >= max(2*M+N,4*N+1,7). This is the minimal requirement. +*> ->> For optimal performance (blocked code) the optimal value +*> is LWORK >= max(2*M+N,3*N+(N+1)*NB,7). Here NB is the optimal +*> block size for DGEQP3 and DGEQRF. +*> In general, optimal LWORK is computed as +*> LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DGEQRF), 7). +*> -> .. an estimate of the scaled condition number of A is +*> required (JOBA='E', 'G'). In this case, LWORK is the maximum +*> of the above and N*N+4*N, i.e. LWORK >= max(2*M+N,N*N+4*N,7). +*> ->> For optimal performance (blocked code) the optimal value +*> is LWORK >= max(2*M+N,3*N+(N+1)*NB, N*N+4*N, 7). +*> In general, the optimal length LWORK is computed as +*> LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DGEQRF), +*> N+N*N+LWORK(DPOCON),7). +*> +*> If SIGMA and the right singular vectors are needed (JOBV = 'V'), +*> -> the minimal requirement is LWORK >= max(2*M+N,4*N+1,7). +*> -> For optimal performance, LWORK >= max(2*M+N,3*N+(N+1)*NB,7), +*> where NB is the optimal block size for DGEQP3, DGEQRF, DGELQF, +*> DORMLQ. In general, the optimal length LWORK is computed as +*> LWORK >= max(2*M+N,N+LWORK(DGEQP3), N+LWORK(DPOCON), +*> N+LWORK(DGELQF), 2*N+LWORK(DGEQRF), N+LWORK(DORMLQ)). +*> +*> If SIGMA and the left singular vectors are needed +*> -> the minimal requirement is LWORK >= max(2*M+N,4*N+1,7). +*> -> For optimal performance: +*> if JOBU = 'U' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,7), +*> if JOBU = 'F' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,N+M*NB,7), +*> where NB is the optimal block size for DGEQP3, DGEQRF, DORMQR. +*> In general, the optimal length LWORK is computed as +*> LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DPOCON), +*> 2*N+LWORK(DGEQRF), N+LWORK(DORMQR)). +*> Here LWORK(DORMQR) equals N*NB (for JOBU = 'U') or +*> M*NB (for JOBU = 'F'). +*> +*> If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and +*> -> if JOBV = 'V' +*> the minimal requirement is LWORK >= max(2*M+N,6*N+2*N*N). +*> -> if JOBV = 'J' the minimal requirement is +*> LWORK >= max(2*M+N, 4*N+N*N,2*N+N*N+6). +*> -> For optimal performance, LWORK should be additionally +*> larger than N+M*NB, where NB is the optimal block size +*> for DORMQR. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(3,M+3*N)). +*> On exit, +*> IWORK(1) = the numerical rank determined after the initial +*> QR factorization with pivoting. See the descriptions +*> of JOBA and JOBR. +*> IWORK(2) = the number of the computed nonzero singular values +*> IWORK(3) = if nonzero, a warning message: +*> If IWORK(3) = 1 then some of the column norms of A +*> were denormalized floats. The requested high accuracy +*> is not warranted by the data. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> < 0: if INFO = -i, then the i-th argument had an illegal value. +*> = 0: successful exit; +*> > 0: DGEJSV did not converge in the maximal allowed number +*> of sweeps. The computed values may be inaccurate. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleGEsing +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> DGEJSV implements a preconditioned Jacobi SVD algorithm. It uses DGEQP3, +*> DGEQRF, and DGELQF as preprocessors and preconditioners. Optionally, an +*> additional row pivoting can be used as a preprocessor, which in some +*> cases results in much higher accuracy. An example is matrix A with the +*> structure A = D1 * C * D2, where D1, D2 are arbitrarily ill-conditioned +*> diagonal matrices and C is well-conditioned matrix. In that case, complete +*> pivoting in the first QR factorizations provides accuracy dependent on the +*> condition number of C, and independent of D1, D2. Such higher accuracy is +*> not completely understood theoretically, but it works well in practice. +*> Further, if A can be written as A = B*D, with well-conditioned B and some +*> diagonal D, then the high accuracy is guaranteed, both theoretically and +*> in software, independent of D. For more details see [1], [2]. +*> The computational range for the singular values can be the full range +*> ( UNDERFLOW,OVERFLOW ), provided that the machine arithmetic and the BLAS +*> & LAPACK routines called by DGEJSV are implemented to work in that range. +*> If that is not the case, then the restriction for safe computation with +*> the singular values in the range of normalized IEEE numbers is that the +*> spectral condition number kappa(A)=sigma_max(A)/sigma_min(A) does not +*> overflow. This code (DGEJSV) is best used in this restricted range, +*> meaning that singular values of magnitude below ||A||_2 / DLAMCH('O') are +*> returned as zeros. See JOBR for details on this. +*> Further, this implementation is somewhat slower than the one described +*> in [1,2] due to replacement of some non-LAPACK components, and because +*> the choice of some tuning parameters in the iterative part (DGESVJ) is +*> left to the implementer on a particular machine. +*> The rank revealing QR factorization (in this code: DGEQP3) should be +*> implemented as in [3]. We have a new version of DGEQP3 under development +*> that is more robust than the current one in LAPACK, with a cleaner cut in +*> rank deficient cases. It will be available in the SIGMA library [4]. +*> If M is much larger than N, it is obvious that the initial QRF with +*> column pivoting can be preprocessed by the QRF without pivoting. That +*> well known trick is not used in DGEJSV because in some cases heavy row +*> weighting can be treated with complete pivoting. The overhead in cases +*> M much larger than N is then only due to pivoting, but the benefits in +*> terms of accuracy have prevailed. The implementer/user can incorporate +*> this extra QRF step easily. The implementer can also improve data movement +*> (matrix transpose, matrix copy, matrix transposed copy) - this +*> implementation of DGEJSV uses only the simplest, naive data movement. +*> \endverbatim +* +*> \par Contributors: +* ================== +*> +*> Zlatko Drmac (Zagreb, Croatia) and Kresimir Veselic (Hagen, Germany) +* +*> \par References: +* ================ +*> +*> \verbatim +*> +*> [1] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm I. +*> SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1322-1342. +*> LAPACK Working note 169. +*> [2] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm II. +*> SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1343-1362. +*> LAPACK Working note 170. +*> [3] Z. Drmac and Z. Bujanovic: On the failure of rank-revealing QR +*> factorization software - a case study. +*> ACM Trans. Math. Softw. Vol. 35, No 2 (2008), pp. 1-28. +*> LAPACK Working note 176. +*> [4] Z. Drmac: SIGMA - mathematical software library for accurate SVD, PSV, +*> QSVD, (H,K)-SVD computations. +*> Department of Mathematics, University of Zagreb, 2008. +*> \endverbatim +* +*> \par Bugs, examples and comments: +* ================================= +*> +*> Please report all bugs and send interesting examples and/or comments to +*> drmac@math.hr. Thank you. +*> +* ===================================================================== + SUBROUTINE DGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, + $ M, N, A, LDA, SVA, U, LDU, V, LDV, + $ WORK, LWORK, IWORK, INFO ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + IMPLICIT NONE + INTEGER INFO, LDA, LDU, LDV, LWORK, M, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), SVA( N ), U( LDU, * ), V( LDV, * ), + $ WORK( LWORK ) + INTEGER IWORK( * ) + CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV +* .. +* +* =========================================================================== +* +* .. Local Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. +* .. Local Scalars .. + DOUBLE PRECISION AAPP, AAQQ, AATMAX, AATMIN, BIG, BIG1, COND_OK, + $ CONDR1, CONDR2, ENTRA, ENTRAT, EPSLN, MAXPRJ, SCALEM, + $ SCONDA, SFMIN, SMALL, TEMP1, USCAL1, USCAL2, XSC + INTEGER IERR, N1, NR, NUMRANK, p, q, WARNING + LOGICAL ALMORT, DEFR, ERREST, GOSCAL, JRACC, KILL, LSVEC, + $ L2ABER, L2KILL, L2PERT, L2RANK, L2TRAN, + $ NOSCAL, ROWPIV, RSVEC, TRANSP +* .. +* .. Intrinsic Functions .. + INTRINSIC DABS, DLOG, MAX, MIN, DBLE, IDNINT, DSIGN, DSQRT +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, DNRM2 + INTEGER IDAMAX + LOGICAL LSAME + EXTERNAL IDAMAX, LSAME, DLAMCH, DNRM2 +* .. +* .. External Subroutines .. + EXTERNAL DCOPY, DGELQF, DGEQP3, DGEQRF, DLACPY, DLASCL, + $ DLASET, DLASSQ, DLASWP, DORGQR, DORMLQ, + $ DORMQR, DPOCON, DSCAL, DSWAP, DTRSM, XERBLA +* + EXTERNAL DGESVJ +* .. +* +* Test the input arguments +* + LSVEC = LSAME( JOBU, 'U' ) .OR. LSAME( JOBU, 'F' ) + JRACC = LSAME( JOBV, 'J' ) + RSVEC = LSAME( JOBV, 'V' ) .OR. JRACC + ROWPIV = LSAME( JOBA, 'F' ) .OR. LSAME( JOBA, 'G' ) + L2RANK = LSAME( JOBA, 'R' ) + L2ABER = LSAME( JOBA, 'A' ) + ERREST = LSAME( JOBA, 'E' ) .OR. LSAME( JOBA, 'G' ) + L2TRAN = LSAME( JOBT, 'T' ) + L2KILL = LSAME( JOBR, 'R' ) + DEFR = LSAME( JOBR, 'N' ) + L2PERT = LSAME( JOBP, 'P' ) +* + IF ( .NOT.(ROWPIV .OR. L2RANK .OR. L2ABER .OR. + $ ERREST .OR. LSAME( JOBA, 'C' ) )) THEN + INFO = - 1 + ELSE IF ( .NOT.( LSVEC .OR. LSAME( JOBU, 'N' ) .OR. + $ LSAME( JOBU, 'W' )) ) THEN + INFO = - 2 + ELSE IF ( .NOT.( RSVEC .OR. LSAME( JOBV, 'N' ) .OR. + $ LSAME( JOBV, 'W' )) .OR. ( JRACC .AND. (.NOT.LSVEC) ) ) THEN + INFO = - 3 + ELSE IF ( .NOT. ( L2KILL .OR. DEFR ) ) THEN + INFO = - 4 + ELSE IF ( .NOT. ( L2TRAN .OR. LSAME( JOBT, 'N' ) ) ) THEN + INFO = - 5 + ELSE IF ( .NOT. ( L2PERT .OR. LSAME( JOBP, 'N' ) ) ) THEN + INFO = - 6 + ELSE IF ( M .LT. 0 ) THEN + INFO = - 7 + ELSE IF ( ( N .LT. 0 ) .OR. ( N .GT. M ) ) THEN + INFO = - 8 + ELSE IF ( LDA .LT. M ) THEN + INFO = - 10 + ELSE IF ( LSVEC .AND. ( LDU .LT. M ) ) THEN + INFO = - 13 + ELSE IF ( RSVEC .AND. ( LDV .LT. N ) ) THEN + INFO = - 15 + ELSE IF ( (.NOT.(LSVEC .OR. RSVEC .OR. ERREST).AND. + & (LWORK .LT. MAX(7,4*N+1,2*M+N))) .OR. + & (.NOT.(LSVEC .OR. RSVEC) .AND. ERREST .AND. + & (LWORK .LT. MAX(7,4*N+N*N,2*M+N))) .OR. + & (LSVEC .AND. (.NOT.RSVEC) .AND. (LWORK .LT. MAX(7,2*M+N,4*N+1))) + & .OR. + & (RSVEC .AND. (.NOT.LSVEC) .AND. (LWORK .LT. MAX(7,2*M+N,4*N+1))) + & .OR. + & (LSVEC .AND. RSVEC .AND. (.NOT.JRACC) .AND. + & (LWORK.LT.MAX(2*M+N,6*N+2*N*N))) + & .OR. (LSVEC .AND. RSVEC .AND. JRACC .AND. + & LWORK.LT.MAX(2*M+N,4*N+N*N,2*N+N*N+6))) + & THEN + INFO = - 17 + ELSE +* #:) + INFO = 0 + END IF +* + IF ( INFO .NE. 0 ) THEN +* #:( + CALL XERBLA( 'DGEJSV', - INFO ) + RETURN + END IF +* +* Quick return for void matrix (Y3K safe) +* #:) + IF ( ( M .EQ. 0 ) .OR. ( N .EQ. 0 ) ) THEN + IWORK(1:3) = 0 + WORK(1:7) = 0 + RETURN + ENDIF +* +* Determine whether the matrix U should be M x N or M x M +* + IF ( LSVEC ) THEN + N1 = N + IF ( LSAME( JOBU, 'F' ) ) N1 = M + END IF +* +* Set numerical parameters +* +*! NOTE: Make sure DLAMCH() does not fail on the target architecture. +* + EPSLN = DLAMCH('Epsilon') + SFMIN = DLAMCH('SafeMinimum') + SMALL = SFMIN / EPSLN + BIG = DLAMCH('O') +* BIG = ONE / SFMIN +* +* Initialize SVA(1:N) = diag( ||A e_i||_2 )_1^N +* +*(!) If necessary, scale SVA() to protect the largest norm from +* overflow. It is possible that this scaling pushes the smallest +* column norm left from the underflow threshold (extreme case). +* + SCALEM = ONE / DSQRT(DBLE(M)*DBLE(N)) + NOSCAL = .TRUE. + GOSCAL = .TRUE. + DO 1874 p = 1, N + AAPP = ZERO + AAQQ = ONE + CALL DLASSQ( M, A(1,p), 1, AAPP, AAQQ ) + IF ( AAPP .GT. BIG ) THEN + INFO = - 9 + CALL XERBLA( 'DGEJSV', -INFO ) + RETURN + END IF + AAQQ = DSQRT(AAQQ) + IF ( ( AAPP .LT. (BIG / AAQQ) ) .AND. NOSCAL ) THEN + SVA(p) = AAPP * AAQQ + ELSE + NOSCAL = .FALSE. + SVA(p) = AAPP * ( AAQQ * SCALEM ) + IF ( GOSCAL ) THEN + GOSCAL = .FALSE. + CALL DSCAL( p-1, SCALEM, SVA, 1 ) + END IF + END IF + 1874 CONTINUE +* + IF ( NOSCAL ) SCALEM = ONE +* + AAPP = ZERO + AAQQ = BIG + DO 4781 p = 1, N + AAPP = MAX( AAPP, SVA(p) ) + IF ( SVA(p) .NE. ZERO ) AAQQ = MIN( AAQQ, SVA(p) ) + 4781 CONTINUE +* +* Quick return for zero M x N matrix +* #:) + IF ( AAPP .EQ. ZERO ) THEN + IF ( LSVEC ) CALL DLASET( 'G', M, N1, ZERO, ONE, U, LDU ) + IF ( RSVEC ) CALL DLASET( 'G', N, N, ZERO, ONE, V, LDV ) + WORK(1) = ONE + WORK(2) = ONE + IF ( ERREST ) WORK(3) = ONE + IF ( LSVEC .AND. RSVEC ) THEN + WORK(4) = ONE + WORK(5) = ONE + END IF + IF ( L2TRAN ) THEN + WORK(6) = ZERO + WORK(7) = ZERO + END IF + IWORK(1) = 0 + IWORK(2) = 0 + IWORK(3) = 0 + RETURN + END IF +* +* Issue warning if denormalized column norms detected. Override the +* high relative accuracy request. Issue licence to kill columns +* (set them to zero) whose norm is less than sigma_max / BIG (roughly). +* #:( + WARNING = 0 + IF ( AAQQ .LE. SFMIN ) THEN + L2RANK = .TRUE. + L2KILL = .TRUE. + WARNING = 1 + END IF +* +* Quick return for one-column matrix +* #:) + IF ( N .EQ. 1 ) THEN +* + IF ( LSVEC ) THEN + CALL DLASCL( 'G',0,0,SVA(1),SCALEM, M,1,A(1,1),LDA,IERR ) + CALL DLACPY( 'A', M, 1, A, LDA, U, LDU ) +* computing all M left singular vectors of the M x 1 matrix + IF ( N1 .NE. N ) THEN + CALL DGEQRF( M, N, U,LDU, WORK, WORK(N+1),LWORK-N,IERR ) + CALL DORGQR( M,N1,1, U,LDU,WORK,WORK(N+1),LWORK-N,IERR ) + CALL DCOPY( M, A(1,1), 1, U(1,1), 1 ) + END IF + END IF + IF ( RSVEC ) THEN + V(1,1) = ONE + END IF + IF ( SVA(1) .LT. (BIG*SCALEM) ) THEN + SVA(1) = SVA(1) / SCALEM + SCALEM = ONE + END IF + WORK(1) = ONE / SCALEM + WORK(2) = ONE + IF ( SVA(1) .NE. ZERO ) THEN + IWORK(1) = 1 + IF ( ( SVA(1) / SCALEM) .GE. SFMIN ) THEN + IWORK(2) = 1 + ELSE + IWORK(2) = 0 + END IF + ELSE + IWORK(1) = 0 + IWORK(2) = 0 + END IF + IWORK(3) = 0 + IF ( ERREST ) WORK(3) = ONE + IF ( LSVEC .AND. RSVEC ) THEN + WORK(4) = ONE + WORK(5) = ONE + END IF + IF ( L2TRAN ) THEN + WORK(6) = ZERO + WORK(7) = ZERO + END IF + RETURN +* + END IF +* + TRANSP = .FALSE. + L2TRAN = L2TRAN .AND. ( M .EQ. N ) +* + AATMAX = -ONE + AATMIN = BIG + IF ( ROWPIV .OR. L2TRAN ) THEN +* +* Compute the row norms, needed to determine row pivoting sequence +* (in the case of heavily row weighted A, row pivoting is strongly +* advised) and to collect information needed to compare the +* structures of A * A^t and A^t * A (in the case L2TRAN.EQ..TRUE.). +* + IF ( L2TRAN ) THEN + DO 1950 p = 1, M + XSC = ZERO + TEMP1 = ONE + CALL DLASSQ( N, A(p,1), LDA, XSC, TEMP1 ) +* DLASSQ gets both the ell_2 and the ell_infinity norm +* in one pass through the vector + WORK(M+N+p) = XSC * SCALEM + WORK(N+p) = XSC * (SCALEM*DSQRT(TEMP1)) + AATMAX = MAX( AATMAX, WORK(N+p) ) + IF (WORK(N+p) .NE. ZERO) AATMIN = MIN(AATMIN,WORK(N+p)) + 1950 CONTINUE + ELSE + DO 1904 p = 1, M + WORK(M+N+p) = SCALEM*DABS( A(p,IDAMAX(N,A(p,1),LDA)) ) + AATMAX = MAX( AATMAX, WORK(M+N+p) ) + AATMIN = MIN( AATMIN, WORK(M+N+p) ) + 1904 CONTINUE + END IF +* + END IF +* +* For square matrix A try to determine whether A^t would be better +* input for the preconditioned Jacobi SVD, with faster convergence. +* The decision is based on an O(N) function of the vector of column +* and row norms of A, based on the Shannon entropy. This should give +* the right choice in most cases when the difference actually matters. +* It may fail and pick the slower converging side. +* + ENTRA = ZERO + ENTRAT = ZERO + IF ( L2TRAN ) THEN +* + XSC = ZERO + TEMP1 = ONE + CALL DLASSQ( N, SVA, 1, XSC, TEMP1 ) + TEMP1 = ONE / TEMP1 +* + ENTRA = ZERO + DO 1113 p = 1, N + BIG1 = ( ( SVA(p) / XSC )**2 ) * TEMP1 + IF ( BIG1 .NE. ZERO ) ENTRA = ENTRA + BIG1 * DLOG(BIG1) + 1113 CONTINUE + ENTRA = - ENTRA / DLOG(DBLE(N)) +* +* Now, SVA().^2/Trace(A^t * A) is a point in the probability simplex. +* It is derived from the diagonal of A^t * A. Do the same with the +* diagonal of A * A^t, compute the entropy of the corresponding +* probability distribution. Note that A * A^t and A^t * A have the +* same trace. +* + ENTRAT = ZERO + DO 1114 p = N+1, N+M + BIG1 = ( ( WORK(p) / XSC )**2 ) * TEMP1 + IF ( BIG1 .NE. ZERO ) ENTRAT = ENTRAT + BIG1 * DLOG(BIG1) + 1114 CONTINUE + ENTRAT = - ENTRAT / DLOG(DBLE(M)) +* +* Analyze the entropies and decide A or A^t. Smaller entropy +* usually means better input for the algorithm. +* + TRANSP = ( ENTRAT .LT. ENTRA ) +* +* If A^t is better than A, transpose A. +* + IF ( TRANSP ) THEN +* In an optimal implementation, this trivial transpose +* should be replaced with faster transpose. + DO 1115 p = 1, N - 1 + DO 1116 q = p + 1, N + TEMP1 = A(q,p) + A(q,p) = A(p,q) + A(p,q) = TEMP1 + 1116 CONTINUE + 1115 CONTINUE + DO 1117 p = 1, N + WORK(M+N+p) = SVA(p) + SVA(p) = WORK(N+p) + 1117 CONTINUE + TEMP1 = AAPP + AAPP = AATMAX + AATMAX = TEMP1 + TEMP1 = AAQQ + AAQQ = AATMIN + AATMIN = TEMP1 + KILL = LSVEC + LSVEC = RSVEC + RSVEC = KILL + IF ( LSVEC ) N1 = N +* + ROWPIV = .TRUE. + END IF +* + END IF +* END IF L2TRAN +* +* Scale the matrix so that its maximal singular value remains less +* than DSQRT(BIG) -- the matrix is scaled so that its maximal column +* has Euclidean norm equal to DSQRT(BIG/N). The only reason to keep +* DSQRT(BIG) instead of BIG is the fact that DGEJSV uses LAPACK and +* BLAS routines that, in some implementations, are not capable of +* working in the full interval [SFMIN,BIG] and that they may provoke +* overflows in the intermediate results. If the singular values spread +* from SFMIN to BIG, then DGESVJ will compute them. So, in that case, +* one should use DGESVJ instead of DGEJSV. +* + BIG1 = DSQRT( BIG ) + TEMP1 = DSQRT( BIG / DBLE(N) ) +* + CALL DLASCL( 'G', 0, 0, AAPP, TEMP1, N, 1, SVA, N, IERR ) + IF ( AAQQ .GT. (AAPP * SFMIN) ) THEN + AAQQ = ( AAQQ / AAPP ) * TEMP1 + ELSE + AAQQ = ( AAQQ * TEMP1 ) / AAPP + END IF + TEMP1 = TEMP1 * SCALEM + CALL DLASCL( 'G', 0, 0, AAPP, TEMP1, M, N, A, LDA, IERR ) +* +* To undo scaling at the end of this procedure, multiply the +* computed singular values with USCAL2 / USCAL1. +* + USCAL1 = TEMP1 + USCAL2 = AAPP +* + IF ( L2KILL ) THEN +* L2KILL enforces computation of nonzero singular values in +* the restricted range of condition number of the initial A, +* sigma_max(A) / sigma_min(A) approx. DSQRT(BIG)/DSQRT(SFMIN). + XSC = DSQRT( SFMIN ) + ELSE + XSC = SMALL +* +* Now, if the condition number of A is too big, +* sigma_max(A) / sigma_min(A) .GT. DSQRT(BIG/N) * EPSLN / SFMIN, +* as a precaution measure, the full SVD is computed using DGESVJ +* with accumulated Jacobi rotations. This provides numerically +* more robust computation, at the cost of slightly increased run +* time. Depending on the concrete implementation of BLAS and LAPACK +* (i.e. how they behave in presence of extreme ill-conditioning) the +* implementor may decide to remove this switch. + IF ( ( AAQQ.LT.DSQRT(SFMIN) ) .AND. LSVEC .AND. RSVEC ) THEN + JRACC = .TRUE. + END IF +* + END IF + IF ( AAQQ .LT. XSC ) THEN + DO 700 p = 1, N + IF ( SVA(p) .LT. XSC ) THEN + CALL DLASET( 'A', M, 1, ZERO, ZERO, A(1,p), LDA ) + SVA(p) = ZERO + END IF + 700 CONTINUE + END IF +* +* Preconditioning using QR factorization with pivoting +* + IF ( ROWPIV ) THEN +* Optional row permutation (Bjoerck row pivoting): +* A result by Cox and Higham shows that the Bjoerck's +* row pivoting combined with standard column pivoting +* has similar effect as Powell-Reid complete pivoting. +* The ell-infinity norms of A are made nonincreasing. + DO 1952 p = 1, M - 1 + q = IDAMAX( M-p+1, WORK(M+N+p), 1 ) + p - 1 + IWORK(2*N+p) = q + IF ( p .NE. q ) THEN + TEMP1 = WORK(M+N+p) + WORK(M+N+p) = WORK(M+N+q) + WORK(M+N+q) = TEMP1 + END IF + 1952 CONTINUE + CALL DLASWP( N, A, LDA, 1, M-1, IWORK(2*N+1), 1 ) + END IF +* +* End of the preparation phase (scaling, optional sorting and +* transposing, optional flushing of small columns). +* +* Preconditioning +* +* If the full SVD is needed, the right singular vectors are computed +* from a matrix equation, and for that we need theoretical analysis +* of the Businger-Golub pivoting. So we use DGEQP3 as the first RR QRF. +* In all other cases the first RR QRF can be chosen by other criteria +* (eg speed by replacing global with restricted window pivoting, such +* as in SGEQPX from TOMS # 782). Good results will be obtained using +* SGEQPX with properly (!) chosen numerical parameters. +* Any improvement of DGEQP3 improves overall performance of DGEJSV. +* +* A * P1 = Q1 * [ R1^t 0]^t: + DO 1963 p = 1, N +* .. all columns are free columns + IWORK(p) = 0 + 1963 CONTINUE + CALL DGEQP3( M,N,A,LDA, IWORK,WORK, WORK(N+1),LWORK-N, IERR ) +* +* The upper triangular matrix R1 from the first QRF is inspected for +* rank deficiency and possibilities for deflation, or possible +* ill-conditioning. Depending on the user specified flag L2RANK, +* the procedure explores possibilities to reduce the numerical +* rank by inspecting the computed upper triangular factor. If +* L2RANK or L2ABER are up, then DGEJSV will compute the SVD of +* A + dA, where ||dA|| <= f(M,N)*EPSLN. +* + NR = 1 + IF ( L2ABER ) THEN +* Standard absolute error bound suffices. All sigma_i with +* sigma_i < N*EPSLN*||A|| are flushed to zero. This is an +* aggressive enforcement of lower numerical rank by introducing a +* backward error of the order of N*EPSLN*||A||. + TEMP1 = DSQRT(DBLE(N))*EPSLN + DO 3001 p = 2, N + IF ( DABS(A(p,p)) .GE. (TEMP1*DABS(A(1,1))) ) THEN + NR = NR + 1 + ELSE + GO TO 3002 + END IF + 3001 CONTINUE + 3002 CONTINUE + ELSE IF ( L2RANK ) THEN +* .. similarly as above, only slightly more gentle (less aggressive). +* Sudden drop on the diagonal of R1 is used as the criterion for +* close-to-rank-deficient. + TEMP1 = DSQRT(SFMIN) + DO 3401 p = 2, N + IF ( ( DABS(A(p,p)) .LT. (EPSLN*DABS(A(p-1,p-1))) ) .OR. + $ ( DABS(A(p,p)) .LT. SMALL ) .OR. + $ ( L2KILL .AND. (DABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3402 + NR = NR + 1 + 3401 CONTINUE + 3402 CONTINUE +* + ELSE +* The goal is high relative accuracy. However, if the matrix +* has high scaled condition number the relative accuracy is in +* general not feasible. Later on, a condition number estimator +* will be deployed to estimate the scaled condition number. +* Here we just remove the underflowed part of the triangular +* factor. This prevents the situation in which the code is +* working hard to get the accuracy not warranted by the data. + TEMP1 = DSQRT(SFMIN) + DO 3301 p = 2, N + IF ( ( DABS(A(p,p)) .LT. SMALL ) .OR. + $ ( L2KILL .AND. (DABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3302 + NR = NR + 1 + 3301 CONTINUE + 3302 CONTINUE +* + END IF +* + ALMORT = .FALSE. + IF ( NR .EQ. N ) THEN + MAXPRJ = ONE + DO 3051 p = 2, N + TEMP1 = DABS(A(p,p)) / SVA(IWORK(p)) + MAXPRJ = MIN( MAXPRJ, TEMP1 ) + 3051 CONTINUE + IF ( MAXPRJ**2 .GE. ONE - DBLE(N)*EPSLN ) ALMORT = .TRUE. + END IF +* +* + SCONDA = - ONE + CONDR1 = - ONE + CONDR2 = - ONE +* + IF ( ERREST ) THEN + IF ( N .EQ. NR ) THEN + IF ( RSVEC ) THEN +* .. V is available as workspace + CALL DLACPY( 'U', N, N, A, LDA, V, LDV ) + DO 3053 p = 1, N + TEMP1 = SVA(IWORK(p)) + CALL DSCAL( p, ONE/TEMP1, V(1,p), 1 ) + 3053 CONTINUE + CALL DPOCON( 'U', N, V, LDV, ONE, TEMP1, + $ WORK(N+1), IWORK(2*N+M+1), IERR ) + ELSE IF ( LSVEC ) THEN +* .. U is available as workspace + CALL DLACPY( 'U', N, N, A, LDA, U, LDU ) + DO 3054 p = 1, N + TEMP1 = SVA(IWORK(p)) + CALL DSCAL( p, ONE/TEMP1, U(1,p), 1 ) + 3054 CONTINUE + CALL DPOCON( 'U', N, U, LDU, ONE, TEMP1, + $ WORK(N+1), IWORK(2*N+M+1), IERR ) + ELSE + CALL DLACPY( 'U', N, N, A, LDA, WORK(N+1), N ) + DO 3052 p = 1, N + TEMP1 = SVA(IWORK(p)) + CALL DSCAL( p, ONE/TEMP1, WORK(N+(p-1)*N+1), 1 ) + 3052 CONTINUE +* .. the columns of R are scaled to have unit Euclidean lengths. + CALL DPOCON( 'U', N, WORK(N+1), N, ONE, TEMP1, + $ WORK(N+N*N+1), IWORK(2*N+M+1), IERR ) + END IF + SCONDA = ONE / DSQRT(TEMP1) +* SCONDA is an estimate of DSQRT(||(R^t * R)^(-1)||_1). +* N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA + ELSE + SCONDA = - ONE + END IF + END IF +* + L2PERT = L2PERT .AND. ( DABS( A(1,1)/A(NR,NR) ) .GT. DSQRT(BIG1) ) +* If there is no violent scaling, artificial perturbation is not needed. +* +* Phase 3: +* + IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN +* +* Singular Values only +* +* .. transpose A(1:NR,1:N) + DO 1946 p = 1, MIN( N-1, NR ) + CALL DCOPY( N-p, A(p,p+1), LDA, A(p+1,p), 1 ) + 1946 CONTINUE +* +* The following two DO-loops introduce small relative perturbation +* into the strict upper triangle of the lower triangular matrix. +* Small entries below the main diagonal are also changed. +* This modification is useful if the computing environment does not +* provide/allow FLUSH TO ZERO underflow, for it prevents many +* annoying denormalized numbers in case of strongly scaled matrices. +* The perturbation is structured so that it does not introduce any +* new perturbation of the singular values, and it does not destroy +* the job done by the preconditioner. +* The licence for this perturbation is in the variable L2PERT, which +* should be .FALSE. if FLUSH TO ZERO underflow is active. +* + IF ( .NOT. ALMORT ) THEN +* + IF ( L2PERT ) THEN +* XSC = DSQRT(SMALL) + XSC = EPSLN / DBLE(N) + DO 4947 q = 1, NR + TEMP1 = XSC*DABS(A(q,q)) + DO 4949 p = 1, N + IF ( ( (p.GT.q) .AND. (DABS(A(p,q)).LE.TEMP1) ) + $ .OR. ( p .LT. q ) ) + $ A(p,q) = DSIGN( TEMP1, A(p,q) ) + 4949 CONTINUE + 4947 CONTINUE + ELSE + CALL DLASET( 'U', NR-1,NR-1, ZERO,ZERO, A(1,2),LDA ) + END IF +* +* .. second preconditioning using the QR factorization +* + CALL DGEQRF( N,NR, A,LDA, WORK, WORK(N+1),LWORK-N, IERR ) +* +* .. and transpose upper to lower triangular + DO 1948 p = 1, NR - 1 + CALL DCOPY( NR-p, A(p,p+1), LDA, A(p+1,p), 1 ) + 1948 CONTINUE +* + END IF +* +* Row-cyclic Jacobi SVD algorithm with column pivoting +* +* .. again some perturbation (a "background noise") is added +* to drown denormals + IF ( L2PERT ) THEN +* XSC = DSQRT(SMALL) + XSC = EPSLN / DBLE(N) + DO 1947 q = 1, NR + TEMP1 = XSC*DABS(A(q,q)) + DO 1949 p = 1, NR + IF ( ( (p.GT.q) .AND. (DABS(A(p,q)).LE.TEMP1) ) + $ .OR. ( p .LT. q ) ) + $ A(p,q) = DSIGN( TEMP1, A(p,q) ) + 1949 CONTINUE + 1947 CONTINUE + ELSE + CALL DLASET( 'U', NR-1, NR-1, ZERO, ZERO, A(1,2), LDA ) + END IF +* +* .. and one-sided Jacobi rotations are started on a lower +* triangular matrix (plus perturbation which is ignored in +* the part which destroys triangular form (confusing?!)) +* + CALL DGESVJ( 'L', 'NoU', 'NoV', NR, NR, A, LDA, SVA, + $ N, V, LDV, WORK, LWORK, INFO ) +* + SCALEM = WORK(1) + NUMRANK = IDNINT(WORK(2)) +* +* + ELSE IF ( RSVEC .AND. ( .NOT. LSVEC ) ) THEN +* +* -> Singular Values and Right Singular Vectors <- +* + IF ( ALMORT ) THEN +* +* .. in this case NR equals N + DO 1998 p = 1, NR + CALL DCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + 1998 CONTINUE + CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV ) +* + CALL DGESVJ( 'L','U','N', N, NR, V,LDV, SVA, NR, A,LDA, + $ WORK, LWORK, INFO ) + SCALEM = WORK(1) + NUMRANK = IDNINT(WORK(2)) + + ELSE +* +* .. two more QR factorizations ( one QRF is not enough, two require +* accumulated product of Jacobi rotations, three are perfect ) +* + CALL DLASET( 'Lower', NR-1, NR-1, ZERO, ZERO, A(2,1), LDA ) + CALL DGELQF( NR, N, A, LDA, WORK, WORK(N+1), LWORK-N, IERR) + CALL DLACPY( 'Lower', NR, NR, A, LDA, V, LDV ) + CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV ) + CALL DGEQRF( NR, NR, V, LDV, WORK(N+1), WORK(2*N+1), + $ LWORK-2*N, IERR ) + DO 8998 p = 1, NR + CALL DCOPY( NR-p+1, V(p,p), LDV, V(p,p), 1 ) + 8998 CONTINUE + CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV ) +* + CALL DGESVJ( 'Lower', 'U','N', NR, NR, V,LDV, SVA, NR, U, + $ LDU, WORK(N+1), LWORK, INFO ) + SCALEM = WORK(N+1) + NUMRANK = IDNINT(WORK(N+2)) + IF ( NR .LT. N ) THEN + CALL DLASET( 'A',N-NR, NR, ZERO,ZERO, V(NR+1,1), LDV ) + CALL DLASET( 'A',NR, N-NR, ZERO,ZERO, V(1,NR+1), LDV ) + CALL DLASET( 'A',N-NR,N-NR,ZERO,ONE, V(NR+1,NR+1), LDV ) + END IF +* + CALL DORMLQ( 'Left', 'Transpose', N, N, NR, A, LDA, WORK, + $ V, LDV, WORK(N+1), LWORK-N, IERR ) +* + END IF +* + DO 8991 p = 1, N + CALL DCOPY( N, V(p,1), LDV, A(IWORK(p),1), LDA ) + 8991 CONTINUE + CALL DLACPY( 'All', N, N, A, LDA, V, LDV ) +* + IF ( TRANSP ) THEN + CALL DLACPY( 'All', N, N, V, LDV, U, LDU ) + END IF +* + ELSE IF ( LSVEC .AND. ( .NOT. RSVEC ) ) THEN +* +* .. Singular Values and Left Singular Vectors .. +* +* .. second preconditioning step to avoid need to accumulate +* Jacobi rotations in the Jacobi iterations. + DO 1965 p = 1, NR + CALL DCOPY( N-p+1, A(p,p), LDA, U(p,p), 1 ) + 1965 CONTINUE + CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, U(1,2), LDU ) +* + CALL DGEQRF( N, NR, U, LDU, WORK(N+1), WORK(2*N+1), + $ LWORK-2*N, IERR ) +* + DO 1967 p = 1, NR - 1 + CALL DCOPY( NR-p, U(p,p+1), LDU, U(p+1,p), 1 ) + 1967 CONTINUE + CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, U(1,2), LDU ) +* + CALL DGESVJ( 'Lower', 'U', 'N', NR,NR, U, LDU, SVA, NR, A, + $ LDA, WORK(N+1), LWORK-N, INFO ) + SCALEM = WORK(N+1) + NUMRANK = IDNINT(WORK(N+2)) +* + IF ( NR .LT. M ) THEN + CALL DLASET( 'A', M-NR, NR,ZERO, ZERO, U(NR+1,1), LDU ) + IF ( NR .LT. N1 ) THEN + CALL DLASET( 'A',NR, N1-NR, ZERO, ZERO, U(1,NR+1), LDU ) + CALL DLASET( 'A',M-NR,N1-NR,ZERO,ONE,U(NR+1,NR+1), LDU ) + END IF + END IF +* + CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, WORK, U, + $ LDU, WORK(N+1), LWORK-N, IERR ) +* + IF ( ROWPIV ) + $ CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(2*N+1), -1 ) +* + DO 1974 p = 1, N1 + XSC = ONE / DNRM2( M, U(1,p), 1 ) + CALL DSCAL( M, XSC, U(1,p), 1 ) + 1974 CONTINUE +* + IF ( TRANSP ) THEN + CALL DLACPY( 'All', N, N, U, LDU, V, LDV ) + END IF +* + ELSE +* +* .. Full SVD .. +* + IF ( .NOT. JRACC ) THEN +* + IF ( .NOT. ALMORT ) THEN +* +* Second Preconditioning Step (QRF [with pivoting]) +* Note that the composition of TRANSPOSE, QRF and TRANSPOSE is +* equivalent to an LQF CALL. Since in many libraries the QRF +* seems to be better optimized than the LQF, we do explicit +* transpose and use the QRF. This is subject to changes in an +* optimized implementation of DGEJSV. +* + DO 1968 p = 1, NR + CALL DCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + 1968 CONTINUE +* +* .. the following two loops perturb small entries to avoid +* denormals in the second QR factorization, where they are +* as good as zeros. This is done to avoid painfully slow +* computation with denormals. The relative size of the perturbation +* is a parameter that can be changed by the implementer. +* This perturbation device will be obsolete on machines with +* properly implemented arithmetic. +* To switch it off, set L2PERT=.FALSE. To remove it from the +* code, remove the action under L2PERT=.TRUE., leave the ELSE part. +* The following two loops should be blocked and fused with the +* transposed copy above. +* + IF ( L2PERT ) THEN + XSC = DSQRT(SMALL) + DO 2969 q = 1, NR + TEMP1 = XSC*DABS( V(q,q) ) + DO 2968 p = 1, N + IF ( ( p .GT. q ) .AND. ( DABS(V(p,q)) .LE. TEMP1 ) + $ .OR. ( p .LT. q ) ) + $ V(p,q) = DSIGN( TEMP1, V(p,q) ) + IF ( p .LT. q ) V(p,q) = - V(p,q) + 2968 CONTINUE + 2969 CONTINUE + ELSE + CALL DLASET( 'U', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV ) + END IF +* +* Estimate the row scaled condition number of R1 +* (If R1 is rectangular, N > NR, then the condition number +* of the leading NR x NR submatrix is estimated.) +* + CALL DLACPY( 'L', NR, NR, V, LDV, WORK(2*N+1), NR ) + DO 3950 p = 1, NR + TEMP1 = DNRM2(NR-p+1,WORK(2*N+(p-1)*NR+p),1) + CALL DSCAL(NR-p+1,ONE/TEMP1,WORK(2*N+(p-1)*NR+p),1) + 3950 CONTINUE + CALL DPOCON('Lower',NR,WORK(2*N+1),NR,ONE,TEMP1, + $ WORK(2*N+NR*NR+1),IWORK(M+2*N+1),IERR) + CONDR1 = ONE / DSQRT(TEMP1) +* .. here need a second opinion on the condition number +* .. then assume worst case scenario +* R1 is OK for inverse <=> CONDR1 .LT. DBLE(N) +* more conservative <=> CONDR1 .LT. DSQRT(DBLE(N)) +* + COND_OK = DSQRT(DBLE(NR)) +*[TP] COND_OK is a tuning parameter. + + IF ( CONDR1 .LT. COND_OK ) THEN +* .. the second QRF without pivoting. Note: in an optimized +* implementation, this QRF should be implemented as the QRF +* of a lower triangular matrix. +* R1^t = Q2 * R2 + CALL DGEQRF( N, NR, V, LDV, WORK(N+1), WORK(2*N+1), + $ LWORK-2*N, IERR ) +* + IF ( L2PERT ) THEN + XSC = DSQRT(SMALL)/EPSLN + DO 3959 p = 2, NR + DO 3958 q = 1, p - 1 + TEMP1 = XSC * MIN(DABS(V(p,p)),DABS(V(q,q))) + IF ( DABS(V(q,p)) .LE. TEMP1 ) + $ V(q,p) = DSIGN( TEMP1, V(q,p) ) + 3958 CONTINUE + 3959 CONTINUE + END IF +* + IF ( NR .NE. N ) + $ CALL DLACPY( 'A', N, NR, V, LDV, WORK(2*N+1), N ) +* .. save ... +* +* .. this transposed copy should be better than naive + DO 1969 p = 1, NR - 1 + CALL DCOPY( NR-p, V(p,p+1), LDV, V(p+1,p), 1 ) + 1969 CONTINUE +* + CONDR2 = CONDR1 +* + ELSE +* +* .. ill-conditioned case: second QRF with pivoting +* Note that windowed pivoting would be equally good +* numerically, and more run-time efficient. So, in +* an optimal implementation, the next call to DGEQP3 +* should be replaced with eg. CALL SGEQPX (ACM TOMS #782) +* with properly (carefully) chosen parameters. +* +* R1^t * P2 = Q2 * R2 + DO 3003 p = 1, NR + IWORK(N+p) = 0 + 3003 CONTINUE + CALL DGEQP3( N, NR, V, LDV, IWORK(N+1), WORK(N+1), + $ WORK(2*N+1), LWORK-2*N, IERR ) +** CALL DGEQRF( N, NR, V, LDV, WORK(N+1), WORK(2*N+1), +** $ LWORK-2*N, IERR ) + IF ( L2PERT ) THEN + XSC = DSQRT(SMALL) + DO 3969 p = 2, NR + DO 3968 q = 1, p - 1 + TEMP1 = XSC * MIN(DABS(V(p,p)),DABS(V(q,q))) + IF ( DABS(V(q,p)) .LE. TEMP1 ) + $ V(q,p) = DSIGN( TEMP1, V(q,p) ) + 3968 CONTINUE + 3969 CONTINUE + END IF +* + CALL DLACPY( 'A', N, NR, V, LDV, WORK(2*N+1), N ) +* + IF ( L2PERT ) THEN + XSC = DSQRT(SMALL) + DO 8970 p = 2, NR + DO 8971 q = 1, p - 1 + TEMP1 = XSC * MIN(DABS(V(p,p)),DABS(V(q,q))) + V(p,q) = - DSIGN( TEMP1, V(q,p) ) + 8971 CONTINUE + 8970 CONTINUE + ELSE + CALL DLASET( 'L',NR-1,NR-1,ZERO,ZERO,V(2,1),LDV ) + END IF +* Now, compute R2 = L3 * Q3, the LQ factorization. + CALL DGELQF( NR, NR, V, LDV, WORK(2*N+N*NR+1), + $ WORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, IERR ) +* .. and estimate the condition number + CALL DLACPY( 'L',NR,NR,V,LDV,WORK(2*N+N*NR+NR+1),NR ) + DO 4950 p = 1, NR + TEMP1 = DNRM2( p, WORK(2*N+N*NR+NR+p), NR ) + CALL DSCAL( p, ONE/TEMP1, WORK(2*N+N*NR+NR+p), NR ) + 4950 CONTINUE + CALL DPOCON( 'L',NR,WORK(2*N+N*NR+NR+1),NR,ONE,TEMP1, + $ WORK(2*N+N*NR+NR+NR*NR+1),IWORK(M+2*N+1),IERR ) + CONDR2 = ONE / DSQRT(TEMP1) +* + IF ( CONDR2 .GE. COND_OK ) THEN +* .. save the Householder vectors used for Q3 +* (this overwrites the copy of R2, as it will not be +* needed in this branch, but it does not overwritte the +* Huseholder vectors of Q2.). + CALL DLACPY( 'U', NR, NR, V, LDV, WORK(2*N+1), N ) +* .. and the rest of the information on Q3 is in +* WORK(2*N+N*NR+1:2*N+N*NR+N) + END IF +* + END IF +* + IF ( L2PERT ) THEN + XSC = DSQRT(SMALL) + DO 4968 q = 2, NR + TEMP1 = XSC * V(q,q) + DO 4969 p = 1, q - 1 +* V(p,q) = - DSIGN( TEMP1, V(q,p) ) + V(p,q) = - DSIGN( TEMP1, V(p,q) ) + 4969 CONTINUE + 4968 CONTINUE + ELSE + CALL DLASET( 'U', NR-1,NR-1, ZERO,ZERO, V(1,2), LDV ) + END IF +* +* Second preconditioning finished; continue with Jacobi SVD +* The input matrix is lower trinagular. +* +* Recover the right singular vectors as solution of a well +* conditioned triangular matrix equation. +* + IF ( CONDR1 .LT. COND_OK ) THEN +* + CALL DGESVJ( 'L','U','N',NR,NR,V,LDV,SVA,NR,U, + $ LDU,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,INFO ) + SCALEM = WORK(2*N+N*NR+NR+1) + NUMRANK = IDNINT(WORK(2*N+N*NR+NR+2)) + DO 3970 p = 1, NR + CALL DCOPY( NR, V(1,p), 1, U(1,p), 1 ) + CALL DSCAL( NR, SVA(p), V(1,p), 1 ) + 3970 CONTINUE + +* .. pick the right matrix equation and solve it +* + IF ( NR .EQ. N ) THEN +* :)) .. best case, R1 is inverted. The solution of this matrix +* equation is Q2*V2 = the product of the Jacobi rotations +* used in DGESVJ, premultiplied with the orthogonal matrix +* from the second QR factorization. + CALL DTRSM( 'L','U','N','N', NR,NR,ONE, A,LDA, V,LDV ) + ELSE +* .. R1 is well conditioned, but non-square. Transpose(R2) +* is inverted to get the product of the Jacobi rotations +* used in DGESVJ. The Q-factor from the second QR +* factorization is then built in explicitly. + CALL DTRSM('L','U','T','N',NR,NR,ONE,WORK(2*N+1), + $ N,V,LDV) + IF ( NR .LT. N ) THEN + CALL DLASET('A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV) + CALL DLASET('A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV) + CALL DLASET('A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV) + END IF + CALL DORMQR('L','N',N,N,NR,WORK(2*N+1),N,WORK(N+1), + $ V,LDV,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR) + END IF +* + ELSE IF ( CONDR2 .LT. COND_OK ) THEN +* +* :) .. the input matrix A is very likely a relative of +* the Kahan matrix :) +* The matrix R2 is inverted. The solution of the matrix equation +* is Q3^T*V3 = the product of the Jacobi rotations (appplied to +* the lower triangular L3 from the LQ factorization of +* R2=L3*Q3), pre-multiplied with the transposed Q3. + CALL DGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U, + $ LDU, WORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, INFO ) + SCALEM = WORK(2*N+N*NR+NR+1) + NUMRANK = IDNINT(WORK(2*N+N*NR+NR+2)) + DO 3870 p = 1, NR + CALL DCOPY( NR, V(1,p), 1, U(1,p), 1 ) + CALL DSCAL( NR, SVA(p), U(1,p), 1 ) + 3870 CONTINUE + CALL DTRSM('L','U','N','N',NR,NR,ONE,WORK(2*N+1),N,U,LDU) +* .. apply the permutation from the second QR factorization + DO 873 q = 1, NR + DO 872 p = 1, NR + WORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q) + 872 CONTINUE + DO 874 p = 1, NR + U(p,q) = WORK(2*N+N*NR+NR+p) + 874 CONTINUE + 873 CONTINUE + IF ( NR .LT. N ) THEN + CALL DLASET( 'A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV ) + CALL DLASET( 'A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV ) + CALL DLASET( 'A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV ) + END IF + CALL DORMQR( 'L','N',N,N,NR,WORK(2*N+1),N,WORK(N+1), + $ V,LDV,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR ) + ELSE +* Last line of defense. +* #:( This is a rather pathological case: no scaled condition +* improvement after two pivoted QR factorizations. Other +* possibility is that the rank revealing QR factorization +* or the condition estimator has failed, or the COND_OK +* is set very close to ONE (which is unnecessary). Normally, +* this branch should never be executed, but in rare cases of +* failure of the RRQR or condition estimator, the last line of +* defense ensures that DGEJSV completes the task. +* Compute the full SVD of L3 using DGESVJ with explicit +* accumulation of Jacobi rotations. + CALL DGESVJ( 'L', 'U', 'V', NR, NR, V, LDV, SVA, NR, U, + $ LDU, WORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, INFO ) + SCALEM = WORK(2*N+N*NR+NR+1) + NUMRANK = IDNINT(WORK(2*N+N*NR+NR+2)) + IF ( NR .LT. N ) THEN + CALL DLASET( 'A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV ) + CALL DLASET( 'A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV ) + CALL DLASET( 'A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV ) + END IF + CALL DORMQR( 'L','N',N,N,NR,WORK(2*N+1),N,WORK(N+1), + $ V,LDV,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR ) +* + CALL DORMLQ( 'L', 'T', NR, NR, NR, WORK(2*N+1), N, + $ WORK(2*N+N*NR+1), U, LDU, WORK(2*N+N*NR+NR+1), + $ LWORK-2*N-N*NR-NR, IERR ) + DO 773 q = 1, NR + DO 772 p = 1, NR + WORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q) + 772 CONTINUE + DO 774 p = 1, NR + U(p,q) = WORK(2*N+N*NR+NR+p) + 774 CONTINUE + 773 CONTINUE +* + END IF +* +* Permute the rows of V using the (column) permutation from the +* first QRF. Also, scale the columns to make them unit in +* Euclidean norm. This applies to all cases. +* + TEMP1 = DSQRT(DBLE(N)) * EPSLN + DO 1972 q = 1, N + DO 972 p = 1, N + WORK(2*N+N*NR+NR+IWORK(p)) = V(p,q) + 972 CONTINUE + DO 973 p = 1, N + V(p,q) = WORK(2*N+N*NR+NR+p) + 973 CONTINUE + XSC = ONE / DNRM2( N, V(1,q), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL DSCAL( N, XSC, V(1,q), 1 ) + 1972 CONTINUE +* At this moment, V contains the right singular vectors of A. +* Next, assemble the left singular vector matrix U (M x N). + IF ( NR .LT. M ) THEN + CALL DLASET( 'A', M-NR, NR, ZERO, ZERO, U(NR+1,1), LDU ) + IF ( NR .LT. N1 ) THEN + CALL DLASET('A',NR,N1-NR,ZERO,ZERO,U(1,NR+1),LDU) + CALL DLASET('A',M-NR,N1-NR,ZERO,ONE,U(NR+1,NR+1),LDU) + END IF + END IF +* +* The Q matrix from the first QRF is built into the left singular +* matrix U. This applies to all cases. +* + CALL DORMQR( 'Left', 'No_Tr', M, N1, N, A, LDA, WORK, U, + $ LDU, WORK(N+1), LWORK-N, IERR ) + +* The columns of U are normalized. The cost is O(M*N) flops. + TEMP1 = DSQRT(DBLE(M)) * EPSLN + DO 1973 p = 1, NR + XSC = ONE / DNRM2( M, U(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL DSCAL( M, XSC, U(1,p), 1 ) + 1973 CONTINUE +* +* If the initial QRF is computed with row pivoting, the left +* singular vectors must be adjusted. +* + IF ( ROWPIV ) + $ CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(2*N+1), -1 ) +* + ELSE +* +* .. the initial matrix A has almost orthogonal columns and +* the second QRF is not needed +* + CALL DLACPY( 'Upper', N, N, A, LDA, WORK(N+1), N ) + IF ( L2PERT ) THEN + XSC = DSQRT(SMALL) + DO 5970 p = 2, N + TEMP1 = XSC * WORK( N + (p-1)*N + p ) + DO 5971 q = 1, p - 1 + WORK(N+(q-1)*N+p)=-DSIGN(TEMP1,WORK(N+(p-1)*N+q)) + 5971 CONTINUE + 5970 CONTINUE + ELSE + CALL DLASET( 'Lower',N-1,N-1,ZERO,ZERO,WORK(N+2),N ) + END IF +* + CALL DGESVJ( 'Upper', 'U', 'N', N, N, WORK(N+1), N, SVA, + $ N, U, LDU, WORK(N+N*N+1), LWORK-N-N*N, INFO ) +* + SCALEM = WORK(N+N*N+1) + NUMRANK = IDNINT(WORK(N+N*N+2)) + DO 6970 p = 1, N + CALL DCOPY( N, WORK(N+(p-1)*N+1), 1, U(1,p), 1 ) + CALL DSCAL( N, SVA(p), WORK(N+(p-1)*N+1), 1 ) + 6970 CONTINUE +* + CALL DTRSM( 'Left', 'Upper', 'NoTrans', 'No UD', N, N, + $ ONE, A, LDA, WORK(N+1), N ) + DO 6972 p = 1, N + CALL DCOPY( N, WORK(N+p), N, V(IWORK(p),1), LDV ) + 6972 CONTINUE + TEMP1 = DSQRT(DBLE(N))*EPSLN + DO 6971 p = 1, N + XSC = ONE / DNRM2( N, V(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL DSCAL( N, XSC, V(1,p), 1 ) + 6971 CONTINUE +* +* Assemble the left singular vector matrix U (M x N). +* + IF ( N .LT. M ) THEN + CALL DLASET( 'A', M-N, N, ZERO, ZERO, U(N+1,1), LDU ) + IF ( N .LT. N1 ) THEN + CALL DLASET( 'A',N, N1-N, ZERO, ZERO, U(1,N+1),LDU ) + CALL DLASET( 'A',M-N,N1-N, ZERO, ONE,U(N+1,N+1),LDU ) + END IF + END IF + CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, WORK, U, + $ LDU, WORK(N+1), LWORK-N, IERR ) + TEMP1 = DSQRT(DBLE(M))*EPSLN + DO 6973 p = 1, N1 + XSC = ONE / DNRM2( M, U(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL DSCAL( M, XSC, U(1,p), 1 ) + 6973 CONTINUE +* + IF ( ROWPIV ) + $ CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(2*N+1), -1 ) +* + END IF +* +* end of the >> almost orthogonal case << in the full SVD +* + ELSE +* +* This branch deploys a preconditioned Jacobi SVD with explicitly +* accumulated rotations. It is included as optional, mainly for +* experimental purposes. It does perform well, and can also be used. +* In this implementation, this branch will be automatically activated +* if the condition number sigma_max(A) / sigma_min(A) is predicted +* to be greater than the overflow threshold. This is because the +* a posteriori computation of the singular vectors assumes robust +* implementation of BLAS and some LAPACK procedures, capable of working +* in presence of extreme values. Since that is not always the case, ... +* + DO 7968 p = 1, NR + CALL DCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + 7968 CONTINUE +* + IF ( L2PERT ) THEN + XSC = DSQRT(SMALL/EPSLN) + DO 5969 q = 1, NR + TEMP1 = XSC*DABS( V(q,q) ) + DO 5968 p = 1, N + IF ( ( p .GT. q ) .AND. ( DABS(V(p,q)) .LE. TEMP1 ) + $ .OR. ( p .LT. q ) ) + $ V(p,q) = DSIGN( TEMP1, V(p,q) ) + IF ( p .LT. q ) V(p,q) = - V(p,q) + 5968 CONTINUE + 5969 CONTINUE + ELSE + CALL DLASET( 'U', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV ) + END IF + + CALL DGEQRF( N, NR, V, LDV, WORK(N+1), WORK(2*N+1), + $ LWORK-2*N, IERR ) + CALL DLACPY( 'L', N, NR, V, LDV, WORK(2*N+1), N ) +* + DO 7969 p = 1, NR + CALL DCOPY( NR-p+1, V(p,p), LDV, U(p,p), 1 ) + 7969 CONTINUE + + IF ( L2PERT ) THEN + XSC = DSQRT(SMALL/EPSLN) + DO 9970 q = 2, NR + DO 9971 p = 1, q - 1 + TEMP1 = XSC * MIN(DABS(U(p,p)),DABS(U(q,q))) + U(p,q) = - DSIGN( TEMP1, U(q,p) ) + 9971 CONTINUE + 9970 CONTINUE + ELSE + CALL DLASET('U', NR-1, NR-1, ZERO, ZERO, U(1,2), LDU ) + END IF + + CALL DGESVJ( 'G', 'U', 'V', NR, NR, U, LDU, SVA, + $ N, V, LDV, WORK(2*N+N*NR+1), LWORK-2*N-N*NR, INFO ) + SCALEM = WORK(2*N+N*NR+1) + NUMRANK = IDNINT(WORK(2*N+N*NR+2)) + + IF ( NR .LT. N ) THEN + CALL DLASET( 'A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV ) + CALL DLASET( 'A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV ) + CALL DLASET( 'A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV ) + END IF + + CALL DORMQR( 'L','N',N,N,NR,WORK(2*N+1),N,WORK(N+1), + $ V,LDV,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR ) +* +* Permute the rows of V using the (column) permutation from the +* first QRF. Also, scale the columns to make them unit in +* Euclidean norm. This applies to all cases. +* + TEMP1 = DSQRT(DBLE(N)) * EPSLN + DO 7972 q = 1, N + DO 8972 p = 1, N + WORK(2*N+N*NR+NR+IWORK(p)) = V(p,q) + 8972 CONTINUE + DO 8973 p = 1, N + V(p,q) = WORK(2*N+N*NR+NR+p) + 8973 CONTINUE + XSC = ONE / DNRM2( N, V(1,q), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL DSCAL( N, XSC, V(1,q), 1 ) + 7972 CONTINUE +* +* At this moment, V contains the right singular vectors of A. +* Next, assemble the left singular vector matrix U (M x N). +* + IF ( NR .LT. M ) THEN + CALL DLASET( 'A', M-NR, NR, ZERO, ZERO, U(NR+1,1), LDU ) + IF ( NR .LT. N1 ) THEN + CALL DLASET( 'A',NR, N1-NR, ZERO, ZERO, U(1,NR+1),LDU ) + CALL DLASET( 'A',M-NR,N1-NR, ZERO, ONE,U(NR+1,NR+1),LDU ) + END IF + END IF +* + CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, WORK, U, + $ LDU, WORK(N+1), LWORK-N, IERR ) +* + IF ( ROWPIV ) + $ CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(2*N+1), -1 ) +* +* + END IF + IF ( TRANSP ) THEN +* .. swap U and V because the procedure worked on A^t + DO 6974 p = 1, N + CALL DSWAP( N, U(1,p), 1, V(1,p), 1 ) + 6974 CONTINUE + END IF +* + END IF +* end of the full SVD +* +* Undo scaling, if necessary (and possible) +* + IF ( USCAL2 .LE. (BIG/SVA(1))*USCAL1 ) THEN + CALL DLASCL( 'G', 0, 0, USCAL1, USCAL2, NR, 1, SVA, N, IERR ) + USCAL1 = ONE + USCAL2 = ONE + END IF +* + IF ( NR .LT. N ) THEN + DO 3004 p = NR+1, N + SVA(p) = ZERO + 3004 CONTINUE + END IF +* + WORK(1) = USCAL2 * SCALEM + WORK(2) = USCAL1 + IF ( ERREST ) WORK(3) = SCONDA + IF ( LSVEC .AND. RSVEC ) THEN + WORK(4) = CONDR1 + WORK(5) = CONDR2 + END IF + IF ( L2TRAN ) THEN + WORK(6) = ENTRA + WORK(7) = ENTRAT + END IF +* + IWORK(1) = NR + IWORK(2) = NUMRANK + IWORK(3) = WARNING +* + RETURN +* .. +* .. END OF DGEJSV +* .. + END +* diff --git a/lapack-netlib/dgesvx.f b/lapack-netlib/dgesvx.f new file mode 100644 index 000000000..f787488dc --- /dev/null +++ b/lapack-netlib/dgesvx.f @@ -0,0 +1,599 @@ +*> \brief DGESVX computes the solution to system of linear equations A * X = B for GE matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DGESVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, +* EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR, +* WORK, IWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, LDA, LDAF, LDB, LDX, N, NRHS +* DOUBLE PRECISION RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ), IWORK( * ) +* DOUBLE PRECISION A( LDA, * ), AF( LDAF, * ), B( LDB, * ), +* $ BERR( * ), C( * ), FERR( * ), R( * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DGESVX uses the LU factorization to compute the solution to a real +*> system of linear equations +*> A * X = B, +*> where A is an N-by-N matrix and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = P * L * U, +*> where P is a permutation matrix, L is a unit lower triangular +*> matrix, and U is upper triangular. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AF and IPIV contain the factored form of A. +*> If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> A, AF, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AF and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AF and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations: +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> On entry, the N-by-N matrix A. If FACT = 'F' and EQUED is +*> not 'N', then A must have been equilibrated by the scaling +*> factors in R and/or C. A is not modified if FACT = 'F' or +*> 'N', or if FACT = 'E' and EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] AF +*> \verbatim +*> AF is DOUBLE PRECISION array, dimension (LDAF,N) +*> If FACT = 'F', then AF is an input argument and on entry +*> contains the factors L and U from the factorization +*> A = P*L*U as computed by DGETRF. If EQUED .ne. 'N', then +*> AF is the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the equilibrated matrix A (see the description of A for +*> the form of the equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAF +*> \verbatim +*> LDAF is INTEGER +*> The leading dimension of the array AF. LDAF >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = P*L*U +*> as computed by DGETRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is DOUBLE PRECISION array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB,NRHS) +*> On entry, the N-by-NRHS right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is DOUBLE PRECISION array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). +*> \endverbatim +*> +*> \param[out] RCOND +*> \verbatim +*> RCOND is DOUBLE PRECISION +*> The estimate of the reciprocal condition number of the matrix +*> A after equilibration (if done). If RCOND is less than the +*> machine precision (in particular, if RCOND = 0), the matrix +*> is singular to working precision. This condition is +*> indicated by a return code of INFO > 0. +*> \endverbatim +*> +*> \param[out] FERR +*> \verbatim +*> FERR is DOUBLE PRECISION array, dimension (NRHS) +*> The estimated forward error bound for each solution vector +*> X(j) (the j-th column of the solution matrix X). +*> If XTRUE is the true solution corresponding to X(j), FERR(j) +*> is an estimated upper bound for the magnitude of the largest +*> element in (X(j) - XTRUE) divided by the magnitude of the +*> largest element in X(j). The estimate is as reliable as +*> the estimate for RCOND, and is almost always a slight +*> overestimate of the true error. +*> \endverbatim +*> +*> \param[out] BERR +*> \verbatim +*> BERR is DOUBLE PRECISION array, dimension (NRHS) +*> The componentwise relative backward error of each solution +*> vector X(j) (i.e., the smallest relative change in +*> any element of A or B that makes X(j) an exact solution). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,4*N)) +*> On exit, WORK(1) contains the reciprocal pivot growth +*> factor norm(A)/norm(U). The "max absolute element" norm is +*> used. If WORK(1) is much less than 1, then the stability +*> of the LU factorization of the (equilibrated) matrix A +*> could be poor. This also means that the solution X, condition +*> estimator RCOND, and forward error bound FERR could be +*> unreliable. If factorization fails with 0 WORK(1) contains the reciprocal pivot growth factor for the +*> leading INFO columns of A. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N) +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, and i is +*> <= N: U(i,i) is exactly zero. The factorization has +*> been completed, but the factor U is exactly +*> singular, so the solution and error bounds +*> could not be computed. RCOND = 0 is returned. +*> = N+1: U is nonsingular, but RCOND is less than machine +*> precision, meaning that the matrix is singular +*> to working precision. Nevertheless, the +*> solution and error bounds are computed because +*> there are a number of situations where the +*> computed solution can be more accurate than the +*> value of RCOND would suggest. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleGEsolve +* +* ===================================================================== + SUBROUTINE DGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, + $ EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR, + $ WORK, IWORK, INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER EQUED, FACT, TRANS + INTEGER INFO, LDA, LDAF, LDB, LDX, N, NRHS + DOUBLE PRECISION RCOND +* .. +* .. Array Arguments .. + INTEGER IPIV( * ), IWORK( * ) + DOUBLE PRECISION A( LDA, * ), AF( LDAF, * ), B( LDB, * ), + $ BERR( * ), C( * ), FERR( * ), R( * ), + $ WORK( * ), X( LDX, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU + CHARACTER NORM + INTEGER I, INFEQU, J + DOUBLE PRECISION AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, + $ ROWCND, RPVGRW, SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + DOUBLE PRECISION DLAMCH, DLANGE, DLANTR + EXTERNAL LSAME, DLAMCH, DLANGE, DLANTR +* .. +* .. External Subroutines .. + EXTERNAL DGECON, DGEEQU, DGERFS, DGETRF, DGETRS, DLACPY, + $ DLAQGE, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + NOFACT = LSAME( FACT, 'N' ) + EQUIL = LSAME( FACT, 'E' ) + NOTRAN = LSAME( TRANS, 'N' ) + IF( NOFACT .OR. EQUIL ) THEN + EQUED = 'N' + ROWEQU = .FALSE. + COLEQU = .FALSE. + ELSE + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + SMLNUM = DLAMCH( 'Safe minimum' ) + BIGNUM = ONE / SMLNUM + END IF +* +* Test the input parameters. +* + IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) ) + $ THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE IF( LDAF.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -10 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -11 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -12 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -14 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -16 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGESVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL DGEEQU( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL DLAQGE( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, + $ EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. EQUIL ) THEN +* +* Compute the LU factorization of A. +* + CALL DLACPY( 'Full', N, N, A, LDA, AF, LDAF ) + CALL DGETRF( N, N, AF, LDAF, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + RPVGRW = DLANTR( 'M', 'U', 'N', INFO, INFO, AF, LDAF, + $ WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = DLANGE( 'M', N, INFO, A, LDA, WORK ) / RPVGRW + END IF + WORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = DLANGE( NORM, N, N, A, LDA, WORK ) + RPVGRW = DLANTR( 'M', 'U', 'N', N, N, AF, LDAF, WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = DLANGE( 'M', N, N, A, LDA, WORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL DGECON( NORM, N, AF, LDAF, ANORM, RCOND, WORK, IWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL DLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL DGETRS( TRANS, N, NRHS, AF, LDAF, IPIV, X, LDX, INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL DGERFS( TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, B, LDB, X, + $ LDX, FERR, BERR, WORK, IWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. +* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 80 J = 1, NRHS + DO 70 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 70 CONTINUE + 80 CONTINUE + DO 90 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 90 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 120 CONTINUE + END IF +* + WORK( 1 ) = RPVGRW +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.DLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 + RETURN +* +* End of DGESVX +* + END diff --git a/lapack-netlib/lapack_testing.py b/lapack-netlib/lapack_testing.py index 5582744a0..fea0ff765 100755 --- a/lapack-netlib/lapack_testing.py +++ b/lapack-netlib/lapack_testing.py @@ -1,31 +1,29 @@ -#! /usr/bin/env python -# -*- coding: utf-8 -*- +#!/usr/bin/env python3 ############################################################################### # lapack_testing.py ############################################################################### -from __future__ import print_function from subprocess import Popen, STDOUT, PIPE import os, sys, math import getopt # Arguments try: opts, args = getopt.getopt(sys.argv[1:], "hd:b:srep:t:n", - ["help", "dir", "bin", "short", "run", "error","prec=","test=","number"]) + ["help", "dir=", "bin=", "short", "run", "error","prec=","test=","number"]) except getopt.error as msg: print(msg) print("for help use --help") sys.exit(2) -short_summary=0 -with_file=1 -just_errors = 0 +short_summary = False +with_file = True +just_errors = False prec='x' test='all' -only_numbers=0 +only_numbers = False test_dir='TESTING' bin_dir='bin/Release' @@ -34,10 +32,9 @@ for o, a in opts: print(sys.argv[0]+" [-h|--help] [-d dir |--dir dir] [-s |--short] [-r |--run] [-e |--error] [-p p |--prec p] [-t test |--test test] [-n | --number]") print(" - h is to print this message") print(" - r is to use to run the LAPACK tests then analyse the output (.out files). By default, the script will not run all the LAPACK tests") - print(" - d [dir] is to indicate where is the LAPACK testing directory (.out files). By default, the script will use .") - print(" - b [bin] is to indicate where is the LAPACK binary files are located. By default, the script will use .") + print(" - d [dir] indicates the location of the LAPACK testing directory (.out files). By default, the script will use {:s}.".format(test_dir)) + print(" - b [bin] indicates the location of the LAPACK binary files. By default, the script will use {:s}.".format(bin_dir)) print(" LEVEL OF OUTPUT") - print(" - x is to print a detailed summary") print(" - e is to print only the error summary") print(" - s is to print a short summary") print(" - n is to print the numbers of failing tests (turn on summary mode)") @@ -63,15 +60,14 @@ for o, a in opts: print(" Will return the numbers of failed tests in REAL precision by running the LAPACK Tests then analyzing the output") print(" ./lapack_testing.py -n -p s -t eig ") print(" Will return the numbers of failed tests in REAL precision by analyzing only the LAPACK output of EIGEN testings") - print("Written by Julie Langou (June 2011) ") sys.exit(0) else: if o in ("-s", "--short"): - short_summary = 1 + short_summary = True if o in ("-r", "--run"): - with_file = 0 + with_file = False if o in ("-e", "--error"): - just_errors = 1 + just_errors = True if o in ( '-p', '--prec' ): prec = a if o in ( '-b', '--bin' ): @@ -81,12 +77,12 @@ for o, a in opts: if o in ( '-t', '--test' ): test = a if o in ( '-n', '--number' ): - only_numbers = 1 - short_summary = 1 + only_numbers = True + short_summary = True # process options -abs_bin_dir=os.path.normpath(os.path.join(os.getcwd(),bin_dir)) +abs_bin_dir=os.path.abspath(bin_dir) os.chdir(test_dir) @@ -108,7 +104,7 @@ def run_summary_test( f, cmdline, short_summary): nb_test_illegal=0 nb_test_info=0 - if (with_file): + if with_file: if not os.path.exists(cmdline): error_message=cmdline+" file not found" r=1 @@ -140,21 +136,21 @@ def run_summary_test( f, cmdline, short_summary): for line in pipe.readlines(): f.write(str(line)) words_in_line=line.split() - if (line.find("run")!=-1): + if (line.find("run)")!=-1): # print line whereisrun=words_in_line.index("run)") nb_test_run+=int(words_in_line[whereisrun-2]) if (line.find("out of")!=-1): - if (short_summary==0): print(line, end=' ') + if not short_summary: print(line, end=' ') whereisout= words_in_line.index("out") nb_test_fail+=int(words_in_line[whereisout-1]) if ((line.find("illegal")!=-1) or (line.find("Illegal")!=-1)): - if (short_summary==0):print(line, end=' ') + if not short_summary: print(line, end=' ') nb_test_illegal+=1 if (line.find(" INFO")!=-1): - if (short_summary==0):print(line, end=' ') + if not short_summary: print(line, end=' ') nb_test_info+=1 - if (with_file==1): + if with_file: pipe.close() f.flush(); @@ -169,7 +165,7 @@ try: except IOError: f = sys.stdout -if (short_summary==0): +if not short_summary: print(" ") print("---------------- Testing LAPACK Routines ----------------") print(" ") @@ -203,6 +199,8 @@ elif test=='mixed': range_prec=[1,3] elif test=='rfp': range_test=[18] +elif test=='dmd': + range_test=[20] elif test=='eig': range_test=list(range(16)) else: @@ -219,7 +217,7 @@ for dtype in range_prec: letter = dtypes[0][dtype] name = dtypes[1][dtype] - if (short_summary==0): + if not short_summary: print(" ") print("------------------------- %s ------------------------" % name) print(" ") @@ -231,19 +229,19 @@ for dtype in range_prec: letter+"gd",letter+"sb",letter+"sg", letter+"bb","glm","gqr", "gsv","csd","lse", - letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp"), + letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp",letter+"dmd"), ("Nonsymmetric-Eigenvalue-Problem", "Symmetric-Eigenvalue-Problem", "Symmetric-Eigenvalue-Problem-2-stage", "Singular-Value-Decomposition", "Eigen-Condition","Nonsymmetric-Eigenvalue","Nonsymmetric-Generalized-Eigenvalue-Problem", "Nonsymmetric-Generalized-Eigenvalue-Problem-driver", "Symmetric-Eigenvalue-Problem", "Symmetric-Eigenvalue-Generalized-Problem", "Banded-Singular-Value-Decomposition-routines", "Generalized-Linear-Regression-Model-routines", "Generalized-QR-and-RQ-factorization-routines", "Generalized-Singular-Value-Decomposition-routines", "CS-Decomposition-routines", "Constrained-Linear-Least-Squares-routines", - "Linear-Equation-routines", "Mixed-Precision-linear-equation-routines","RFP-linear-equation-routines"), + "Linear-Equation-routines", "Mixed-Precision-linear-equation-routines","RFP-linear-equation-routines","Dynamic-Mode-Decomposition"), (letter+"nep", letter+"sep", letter+"se2", letter+"svd", letter+"ec",letter+"ed",letter+"gg", letter+"gd",letter+"sb",letter+"sg", letter+"bb",letter+"glm",letter+"gqr", letter+"gsv",letter+"csd",letter+"lse", - letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp"), + letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp",letter+"dmd"), ) @@ -252,7 +250,7 @@ for dtype in range_prec: # NEED TO SKIP SOME PRECISION (namely s and c) FOR PROTO MIXED PRECISION TESTING if dtest==17 and (letter=="s" or letter=="c"): continue - if (with_file==1): + if with_file: cmdbase=dtests[2][dtest]+".out" else: if dtest==16: @@ -264,10 +262,13 @@ for dtype in range_prec: elif dtest==18: # PROTO LIN TESTS cmdbase="LIN/xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + elif dtest==20: + # DMD EIG TESTS + cmdbase="EIG/xdmdeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" else: # EIG TESTS cmdbase="EIG/xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" - if (not just_errors and not short_summary): + if not just_errors and not short_summary: print("Testing "+name+" "+dtests[1][dtest]+"-"+cmdbase, end=' ') # Run the process: either to read the file or run the LAPACK testing nb_test = run_summary_test(f, cmdbase, short_summary) @@ -277,19 +278,19 @@ for dtype in range_prec: list_results[3][dtype]+=nb_test[3] got_error=nb_test[1]+nb_test[2]+nb_test[3] - if (not short_summary): - if (nb_test[0]>0 and just_errors==0): + if not short_summary: + if nb_test[0] > 0 and not just_errors: print("passed: "+str(nb_test[0])) - if (nb_test[1]>0): + if nb_test[1] > 0: print("failing to pass the threshold: "+str(nb_test[1])) - if (nb_test[2]>0): + if nb_test[2] > 0: print("Illegal Error: "+str(nb_test[2])) - if (nb_test[3]>0): + if nb_test[3] > 0: print("Info Error: "+str(nb_test[3])) - if (got_error>0 and just_errors==1): + if got_error > 0 and just_errors: print("ERROR IS LOCATED IN "+name+" "+dtests[1][dtest]+" [ "+cmdbase+" ]") print("") - if (just_errors==0): + if not just_errors: print("") # elif (got_error>0): # print dtests[2][dtest]+".out \t"+str(nb_test[1])+"\t"+str(nb_test[2])+"\t"+str(nb_test[3]) @@ -307,7 +308,7 @@ for dtype in range_prec: list_results[2][4]+=list_results[2][dtype] list_results[3][4]+=list_results[3][dtype] -if only_numbers==1: +if only_numbers: print(str(list_results[1][4])+"\n"+str(list_results[2][4]+list_results[3][4])) else: print(summary) diff --git a/lapack-netlib/sgbsvx.f b/lapack-netlib/sgbsvx.f new file mode 100644 index 000000000..df3a721d9 --- /dev/null +++ b/lapack-netlib/sgbsvx.f @@ -0,0 +1,641 @@ +*> \brief SGBSVX computes the solution to system of linear equations A * X = B for GB matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SGBSVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, +* LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, +* RCOND, FERR, BERR, WORK, IWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS +* REAL RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ), IWORK( * ) +* REAL AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), +* $ BERR( * ), C( * ), FERR( * ), R( * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SGBSVX uses the LU factorization to compute the solution to a real +*> system of linear equations A * X = B, A**T * X = B, or A**H * X = B, +*> where A is a band matrix of order N with KL subdiagonals and KU +*> superdiagonals, and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed by this subroutine: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = L * U, +*> where L is a product of permutation and unit lower triangular +*> matrices with KL subdiagonals, and U is upper triangular with +*> KL+KU superdiagonals. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AFB and IPIV contain the factored form of +*> A. If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> AB, AFB, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AFB and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AFB and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations. +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KL +*> \verbatim +*> KL is INTEGER +*> The number of subdiagonals within the band of A. KL >= 0. +*> \endverbatim +*> +*> \param[in] KU +*> \verbatim +*> KU is INTEGER +*> The number of superdiagonals within the band of A. KU >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is REAL array, dimension (LDAB,N) +*> On entry, the matrix A in band storage, in rows 1 to KL+KU+1. +*> The j-th column of A is stored in the j-th column of the +*> array AB as follows: +*> AB(KU+1+i-j,j) = A(i,j) for max(1,j-KU)<=i<=min(N,j+kl) +*> +*> If FACT = 'F' and EQUED is not 'N', then A must have been +*> equilibrated by the scaling factors in R and/or C. AB is not +*> modified if FACT = 'F' or 'N', or if FACT = 'E' and +*> EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] AFB +*> \verbatim +*> AFB is REAL array, dimension (LDAFB,N) +*> If FACT = 'F', then AFB is an input argument and on entry +*> contains details of the LU factorization of the band matrix +*> A, as computed by SGBTRF. U is stored as an upper triangular +*> band matrix with KL+KU superdiagonals in rows 1 to KL+KU+1, +*> and the multipliers used during the factorization are stored +*> in rows KL+KU+2 to 2*KL+KU+1. If EQUED .ne. 'N', then AFB is +*> the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AFB is an output argument and on exit +*> returns details of the LU factorization of A. +*> +*> If FACT = 'E', then AFB is an output argument and on exit +*> returns details of the LU factorization of the equilibrated +*> matrix A (see the description of AB for the form of the +*> equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAFB +*> \verbatim +*> LDAFB is INTEGER +*> The leading dimension of the array AFB. LDAFB >= 2*KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = L*U +*> as computed by SGBTRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is REAL array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is REAL array, dimension (LDB,NRHS) +*> On entry, the right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is REAL array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). +*> \endverbatim +*> +*> \param[out] RCOND +*> \verbatim +*> RCOND is REAL +*> The estimate of the reciprocal condition number of the matrix +*> A after equilibration (if done). If RCOND is less than the +*> machine precision (in particular, if RCOND = 0), the matrix +*> is singular to working precision. This condition is +*> indicated by a return code of INFO > 0. +*> \endverbatim +*> +*> \param[out] FERR +*> \verbatim +*> FERR is REAL array, dimension (NRHS) +*> The estimated forward error bound for each solution vector +*> X(j) (the j-th column of the solution matrix X). +*> If XTRUE is the true solution corresponding to X(j), FERR(j) +*> is an estimated upper bound for the magnitude of the largest +*> element in (X(j) - XTRUE) divided by the magnitude of the +*> largest element in X(j). The estimate is as reliable as +*> the estimate for RCOND, and is almost always a slight +*> overestimate of the true error. +*> \endverbatim +*> +*> \param[out] BERR +*> \verbatim +*> BERR is REAL array, dimension (NRHS) +*> The componentwise relative backward error of each solution +*> vector X(j) (i.e., the smallest relative change in +*> any element of A or B that makes X(j) an exact solution). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (MAX(1,3*N)) +*> On exit, WORK(1) contains the reciprocal pivot growth +*> factor norm(A)/norm(U). The "max absolute element" norm is +*> used. If WORK(1) is much less than 1, then the stability +*> of the LU factorization of the (equilibrated) matrix A +*> could be poor. This also means that the solution X, condition +*> estimator RCOND, and forward error bound FERR could be +*> unreliable. If factorization fails with 0 WORK(1) contains the reciprocal pivot growth factor for the +*> leading INFO columns of A. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N) +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, and i is +*> <= N: U(i,i) is exactly zero. The factorization +*> has been completed, but the factor U is exactly +*> singular, so the solution and error bounds +*> could not be computed. RCOND = 0 is returned. +*> = N+1: U is nonsingular, but RCOND is less than machine +*> precision, meaning that the matrix is singular +*> to working precision. Nevertheless, the +*> solution and error bounds are computed because +*> there are a number of situations where the +*> computed solution can be more accurate than the +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup realGBsolve +* +* ===================================================================== + SUBROUTINE SGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, + $ LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, + $ RCOND, FERR, BERR, WORK, IWORK, INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER EQUED, FACT, TRANS + INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS + REAL RCOND +* .. +* .. Array Arguments .. + INTEGER IPIV( * ), IWORK( * ) + REAL AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), + $ BERR( * ), C( * ), FERR( * ), R( * ), + $ WORK( * ), X( LDX, * ) +* .. +* +* ===================================================================== +* Moved setting of INFO = N+1 so INFO does not subsequently get +* overwritten. Sven, 17 Mar 05. +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU + CHARACTER NORM + INTEGER I, INFEQU, J, J1, J2 + REAL AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, + $ ROWCND, RPVGRW, SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + REAL SLAMCH, SLANGB, SLANTB + EXTERNAL LSAME, SLAMCH, SLANGB, SLANTB +* .. +* .. External Subroutines .. + EXTERNAL SCOPY, SGBCON, SGBEQU, SGBRFS, SGBTRF, SGBTRS, + $ SLACPY, SLAQGB, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + NOFACT = LSAME( FACT, 'N' ) + EQUIL = LSAME( FACT, 'E' ) + NOTRAN = LSAME( TRANS, 'N' ) + IF( NOFACT .OR. EQUIL ) THEN + EQUED = 'N' + ROWEQU = .FALSE. + COLEQU = .FALSE. + ELSE + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + SMLNUM = SLAMCH( 'Safe minimum' ) + BIGNUM = ONE / SMLNUM + END IF +* +* Test the input parameters. +* + IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) ) + $ THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( KL.LT.0 ) THEN + INFO = -4 + ELSE IF( KU.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDAB.LT.KL+KU+1 ) THEN + INFO = -8 + ELSE IF( LDAFB.LT.2*KL+KU+1 ) THEN + INFO = -10 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -12 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -13 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -14 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -16 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -18 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGBSVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL SGBEQU( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL SLAQGB( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. EQUIL ) THEN +* +* Compute the LU factorization of the band matrix A. +* + DO 70 J = 1, N + J1 = MAX( J-KU, 1 ) + J2 = MIN( J+KL, N ) + CALL SCOPY( J2-J1+1, AB( KU+1-J+J1, J ), 1, + $ AFB( KL+KU+1-J+J1, J ), 1 ) + 70 CONTINUE +* + CALL SGBTRF( N, N, KL, KU, AFB, LDAFB, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + ANORM = ZERO + DO 90 J = 1, INFO + DO 80 I = MAX( KU+2-J, 1 ), MIN( N+KU+1-J, KL+KU+1 ) + ANORM = MAX( ANORM, ABS( AB( I, J ) ) ) + 80 CONTINUE + 90 CONTINUE + RPVGRW = SLANTB( 'M', 'U', 'N', INFO, MIN( INFO-1, KL+KU ), + $ AFB( MAX( 1, KL+KU+2-INFO ), 1 ), LDAFB, + $ WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = ANORM / RPVGRW + END IF + WORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = SLANGB( NORM, N, KL, KU, AB, LDAB, WORK ) + RPVGRW = SLANTB( 'M', 'U', 'N', N, KL+KU, AFB, LDAFB, WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = SLANGB( 'M', N, KL, KU, AB, LDAB, WORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL SGBCON( NORM, N, KL, KU, AFB, LDAFB, IPIV, ANORM, RCOND, + $ WORK, IWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL SLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL SGBTRS( TRANS, N, KL, KU, NRHS, AFB, LDAFB, IPIV, X, LDX, + $ INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL SGBRFS( TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, LDAFB, IPIV, + $ B, LDB, X, LDX, FERR, BERR, WORK, IWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. +* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 120 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 140 J = 1, NRHS + DO 130 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 130 CONTINUE + 140 CONTINUE + DO 150 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 150 CONTINUE + END IF +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 +* + WORK( 1 ) = RPVGRW + RETURN +* +* End of SGBSVX +* + END diff --git a/lapack-netlib/sgesvx.f b/lapack-netlib/sgesvx.f new file mode 100644 index 000000000..385e626cf --- /dev/null +++ b/lapack-netlib/sgesvx.f @@ -0,0 +1,599 @@ +*> \brief SGESVX computes the solution to system of linear equations A * X = B for GE matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SGESVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, +* EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR, +* WORK, IWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, LDA, LDAF, LDB, LDX, N, NRHS +* REAL RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ), IWORK( * ) +* REAL A( LDA, * ), AF( LDAF, * ), B( LDB, * ), +* $ BERR( * ), C( * ), FERR( * ), R( * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SGESVX uses the LU factorization to compute the solution to a real +*> system of linear equations +*> A * X = B, +*> where A is an N-by-N matrix and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = P * L * U, +*> where P is a permutation matrix, L is a unit lower triangular +*> matrix, and U is upper triangular. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AF and IPIV contain the factored form of A. +*> If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> A, AF, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AF and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AF and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations: +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> On entry, the N-by-N matrix A. If FACT = 'F' and EQUED is +*> not 'N', then A must have been equilibrated by the scaling +*> factors in R and/or C. A is not modified if FACT = 'F' or +*> 'N', or if FACT = 'E' and EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] AF +*> \verbatim +*> AF is REAL array, dimension (LDAF,N) +*> If FACT = 'F', then AF is an input argument and on entry +*> contains the factors L and U from the factorization +*> A = P*L*U as computed by SGETRF. If EQUED .ne. 'N', then +*> AF is the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the equilibrated matrix A (see the description of A for +*> the form of the equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAF +*> \verbatim +*> LDAF is INTEGER +*> The leading dimension of the array AF. LDAF >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = P*L*U +*> as computed by SGETRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is REAL array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is REAL array, dimension (LDB,NRHS) +*> On entry, the N-by-NRHS right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is REAL array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). +*> \endverbatim +*> +*> \param[out] RCOND +*> \verbatim +*> RCOND is REAL +*> The estimate of the reciprocal condition number of the matrix +*> A after equilibration (if done). If RCOND is less than the +*> machine precision (in particular, if RCOND = 0), the matrix +*> is singular to working precision. This condition is +*> indicated by a return code of INFO > 0. +*> \endverbatim +*> +*> \param[out] FERR +*> \verbatim +*> FERR is REAL array, dimension (NRHS) +*> The estimated forward error bound for each solution vector +*> X(j) (the j-th column of the solution matrix X). +*> If XTRUE is the true solution corresponding to X(j), FERR(j) +*> is an estimated upper bound for the magnitude of the largest +*> element in (X(j) - XTRUE) divided by the magnitude of the +*> largest element in X(j). The estimate is as reliable as +*> the estimate for RCOND, and is almost always a slight +*> overestimate of the true error. +*> \endverbatim +*> +*> \param[out] BERR +*> \verbatim +*> BERR is REAL array, dimension (NRHS) +*> The componentwise relative backward error of each solution +*> vector X(j) (i.e., the smallest relative change in +*> any element of A or B that makes X(j) an exact solution). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (MAX(1,4*N)) +*> On exit, WORK(1) contains the reciprocal pivot growth +*> factor norm(A)/norm(U). The "max absolute element" norm is +*> used. If WORK(1) is much less than 1, then the stability +*> of the LU factorization of the (equilibrated) matrix A +*> could be poor. This also means that the solution X, condition +*> estimator RCOND, and forward error bound FERR could be +*> unreliable. If factorization fails with 0 WORK(1) contains the reciprocal pivot growth factor for the +*> leading INFO columns of A. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N) +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, and i is +*> <= N: U(i,i) is exactly zero. The factorization has +*> been completed, but the factor U is exactly +*> singular, so the solution and error bounds +*> could not be computed. RCOND = 0 is returned. +*> = N+1: U is nonsingular, but RCOND is less than machine +*> precision, meaning that the matrix is singular +*> to working precision. Nevertheless, the +*> solution and error bounds are computed because +*> there are a number of situations where the +*> computed solution can be more accurate than the +*> value of RCOND would suggest. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup realGEsolve +* +* ===================================================================== + SUBROUTINE SGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, + $ EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR, + $ WORK, IWORK, INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER EQUED, FACT, TRANS + INTEGER INFO, LDA, LDAF, LDB, LDX, N, NRHS + REAL RCOND +* .. +* .. Array Arguments .. + INTEGER IPIV( * ), IWORK( * ) + REAL A( LDA, * ), AF( LDAF, * ), B( LDB, * ), + $ BERR( * ), C( * ), FERR( * ), R( * ), + $ WORK( * ), X( LDX, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU + CHARACTER NORM + INTEGER I, INFEQU, J + REAL AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, + $ ROWCND, RPVGRW, SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + REAL SLAMCH, SLANGE, SLANTR + EXTERNAL LSAME, SLAMCH, SLANGE, SLANTR +* .. +* .. External Subroutines .. + EXTERNAL SGECON, SGEEQU, SGERFS, SGETRF, SGETRS, SLACPY, + $ SLAQGE, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + NOFACT = LSAME( FACT, 'N' ) + EQUIL = LSAME( FACT, 'E' ) + NOTRAN = LSAME( TRANS, 'N' ) + IF( NOFACT .OR. EQUIL ) THEN + EQUED = 'N' + ROWEQU = .FALSE. + COLEQU = .FALSE. + ELSE + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + SMLNUM = SLAMCH( 'Safe minimum' ) + BIGNUM = ONE / SMLNUM + END IF +* +* Test the input parameters. +* + IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) ) + $ THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE IF( LDAF.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -10 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -11 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -12 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -14 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -16 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGESVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL SGEEQU( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL SLAQGE( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, + $ EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. EQUIL ) THEN +* +* Compute the LU factorization of A. +* + CALL SLACPY( 'Full', N, N, A, LDA, AF, LDAF ) + CALL SGETRF( N, N, AF, LDAF, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + RPVGRW = SLANTR( 'M', 'U', 'N', INFO, INFO, AF, LDAF, + $ WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = SLANGE( 'M', N, INFO, A, LDA, WORK ) / RPVGRW + END IF + WORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = SLANGE( NORM, N, N, A, LDA, WORK ) + RPVGRW = SLANTR( 'M', 'U', 'N', N, N, AF, LDAF, WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = SLANGE( 'M', N, N, A, LDA, WORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL SGECON( NORM, N, AF, LDAF, ANORM, RCOND, WORK, IWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL SLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL SGETRS( TRANS, N, NRHS, AF, LDAF, IPIV, X, LDX, INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL SGERFS( TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, B, LDB, X, + $ LDX, FERR, BERR, WORK, IWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. +* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 80 J = 1, NRHS + DO 70 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 70 CONTINUE + 80 CONTINUE + DO 90 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 90 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 120 CONTINUE + END IF +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 +* + WORK( 1 ) = RPVGRW + RETURN +* +* End of SGESVX +* + END diff --git a/lapack-netlib/zgbsvx.f b/lapack-netlib/zgbsvx.f new file mode 100644 index 000000000..871564a81 --- /dev/null +++ b/lapack-netlib/zgbsvx.f @@ -0,0 +1,644 @@ +*> \brief ZGBSVX computes the solution to system of linear equations A * X = B for GB matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZGBSVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, +* LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, +* RCOND, FERR, BERR, WORK, RWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS +* DOUBLE PRECISION RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ) +* DOUBLE PRECISION BERR( * ), C( * ), FERR( * ), R( * ), +* $ RWORK( * ) +* COMPLEX*16 AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGBSVX uses the LU factorization to compute the solution to a complex +*> system of linear equations A * X = B, A**T * X = B, or A**H * X = B, +*> where A is a band matrix of order N with KL subdiagonals and KU +*> superdiagonals, and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed by this subroutine: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = L * U, +*> where L is a product of permutation and unit lower triangular +*> matrices with KL subdiagonals, and U is upper triangular with +*> KL+KU superdiagonals. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AFB and IPIV contain the factored form of +*> A. If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> AB, AFB, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AFB and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AFB and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations. +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KL +*> \verbatim +*> KL is INTEGER +*> The number of subdiagonals within the band of A. KL >= 0. +*> \endverbatim +*> +*> \param[in] KU +*> \verbatim +*> KU is INTEGER +*> The number of superdiagonals within the band of A. KU >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is COMPLEX*16 array, dimension (LDAB,N) +*> On entry, the matrix A in band storage, in rows 1 to KL+KU+1. +*> The j-th column of A is stored in the j-th column of the +*> array AB as follows: +*> AB(KU+1+i-j,j) = A(i,j) for max(1,j-KU)<=i<=min(N,j+kl) +*> +*> If FACT = 'F' and EQUED is not 'N', then A must have been +*> equilibrated by the scaling factors in R and/or C. AB is not +*> modified if FACT = 'F' or 'N', or if FACT = 'E' and +*> EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] AFB +*> \verbatim +*> AFB is COMPLEX*16 array, dimension (LDAFB,N) +*> If FACT = 'F', then AFB is an input argument and on entry +*> contains details of the LU factorization of the band matrix +*> A, as computed by ZGBTRF. U is stored as an upper triangular +*> band matrix with KL+KU superdiagonals in rows 1 to KL+KU+1, +*> and the multipliers used during the factorization are stored +*> in rows KL+KU+2 to 2*KL+KU+1. If EQUED .ne. 'N', then AFB is +*> the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AFB is an output argument and on exit +*> returns details of the LU factorization of A. +*> +*> If FACT = 'E', then AFB is an output argument and on exit +*> returns details of the LU factorization of the equilibrated +*> matrix A (see the description of AB for the form of the +*> equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAFB +*> \verbatim +*> LDAFB is INTEGER +*> The leading dimension of the array AFB. LDAFB >= 2*KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = L*U +*> as computed by ZGBTRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is DOUBLE PRECISION array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (LDB,NRHS) +*> On entry, the right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is COMPLEX*16 array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). +*> \endverbatim +*> +*> \param[out] RCOND +*> \verbatim +*> RCOND is DOUBLE PRECISION +*> The estimate of the reciprocal condition number of the matrix +*> A after equilibration (if done). If RCOND is less than the +*> machine precision (in particular, if RCOND = 0), the matrix +*> is singular to working precision. This condition is +*> indicated by a return code of INFO > 0. +*> \endverbatim +*> +*> \param[out] FERR +*> \verbatim +*> FERR is DOUBLE PRECISION array, dimension (NRHS) +*> The estimated forward error bound for each solution vector +*> X(j) (the j-th column of the solution matrix X). +*> If XTRUE is the true solution corresponding to X(j), FERR(j) +*> is an estimated upper bound for the magnitude of the largest +*> element in (X(j) - XTRUE) divided by the magnitude of the +*> largest element in X(j). The estimate is as reliable as +*> the estimate for RCOND, and is almost always a slight +*> overestimate of the true error. +*> \endverbatim +*> +*> \param[out] BERR +*> \verbatim +*> BERR is DOUBLE PRECISION array, dimension (NRHS) +*> The componentwise relative backward error of each solution +*> vector X(j) (i.e., the smallest relative change in +*> any element of A or B that makes X(j) an exact solution). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (2*N) +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, dimension (MAX(1,N)) +*> On exit, RWORK(1) contains the reciprocal pivot growth +*> factor norm(A)/norm(U). The "max absolute element" norm is +*> used. If RWORK(1) is much less than 1, then the stability +*> of the LU factorization of the (equilibrated) matrix A +*> could be poor. This also means that the solution X, condition +*> estimator RCOND, and forward error bound FERR could be +*> unreliable. If factorization fails with 0 RWORK(1) contains the reciprocal pivot growth factor for the +*> leading INFO columns of A. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, and i is +*> <= N: U(i,i) is exactly zero. The factorization +*> has been completed, but the factor U is exactly +*> singular, so the solution and error bounds +*> could not be computed. RCOND = 0 is returned. +*> = N+1: U is nonsingular, but RCOND is less than machine +*> precision, meaning that the matrix is singular +*> to working precision. Nevertheless, the +*> solution and error bounds are computed because +*> there are a number of situations where the +*> computed solution can be more accurate than the +*> value of RCOND would suggest. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16GBsolve +* +* ===================================================================== + SUBROUTINE ZGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, + $ LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, + $ RCOND, FERR, BERR, WORK, RWORK, INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER EQUED, FACT, TRANS + INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS + DOUBLE PRECISION RCOND +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + DOUBLE PRECISION BERR( * ), C( * ), FERR( * ), R( * ), + $ RWORK( * ) + COMPLEX*16 AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), + $ WORK( * ), X( LDX, * ) +* .. +* +* ===================================================================== +* Moved setting of INFO = N+1 so INFO does not subsequently get +* overwritten. Sven, 17 Mar 05. +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU + CHARACTER NORM + INTEGER I, INFEQU, J, J1, J2 + DOUBLE PRECISION AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, + $ ROWCND, RPVGRW, SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + DOUBLE PRECISION DLAMCH, ZLANGB, ZLANTB + EXTERNAL LSAME, DLAMCH, ZLANGB, ZLANTB +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZCOPY, ZGBCON, ZGBEQU, ZGBRFS, ZGBTRF, + $ ZGBTRS, ZLACPY, ZLAQGB +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + NOFACT = LSAME( FACT, 'N' ) + EQUIL = LSAME( FACT, 'E' ) + NOTRAN = LSAME( TRANS, 'N' ) + IF( NOFACT .OR. EQUIL ) THEN + EQUED = 'N' + ROWEQU = .FALSE. + COLEQU = .FALSE. + ELSE + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + SMLNUM = DLAMCH( 'Safe minimum' ) + BIGNUM = ONE / SMLNUM + END IF +* +* Test the input parameters. +* + IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) ) + $ THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( KL.LT.0 ) THEN + INFO = -4 + ELSE IF( KU.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDAB.LT.KL+KU+1 ) THEN + INFO = -8 + ELSE IF( LDAFB.LT.2*KL+KU+1 ) THEN + INFO = -10 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -12 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -13 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -14 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -16 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -18 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGBSVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL ZGBEQU( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL ZLAQGB( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. EQUIL ) THEN +* +* Compute the LU factorization of the band matrix A. +* + DO 70 J = 1, N + J1 = MAX( J-KU, 1 ) + J2 = MIN( J+KL, N ) + CALL ZCOPY( J2-J1+1, AB( KU+1-J+J1, J ), 1, + $ AFB( KL+KU+1-J+J1, J ), 1 ) + 70 CONTINUE +* + CALL ZGBTRF( N, N, KL, KU, AFB, LDAFB, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + ANORM = ZERO + DO 90 J = 1, INFO + DO 80 I = MAX( KU+2-J, 1 ), MIN( N+KU+1-J, KL+KU+1 ) + ANORM = MAX( ANORM, ABS( AB( I, J ) ) ) + 80 CONTINUE + 90 CONTINUE + RPVGRW = ZLANTB( 'M', 'U', 'N', INFO, MIN( INFO-1, KL+KU ), + $ AFB( MAX( 1, KL+KU+2-INFO ), 1 ), LDAFB, + $ RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = ANORM / RPVGRW + END IF + RWORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = ZLANGB( NORM, N, KL, KU, AB, LDAB, RWORK ) + RPVGRW = ZLANTB( 'M', 'U', 'N', N, KL+KU, AFB, LDAFB, RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = ZLANGB( 'M', N, KL, KU, AB, LDAB, RWORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL ZGBCON( NORM, N, KL, KU, AFB, LDAFB, IPIV, ANORM, RCOND, + $ WORK, RWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL ZLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL ZGBTRS( TRANS, N, KL, KU, NRHS, AFB, LDAFB, IPIV, X, LDX, + $ INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL ZGBRFS( TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, LDAFB, IPIV, + $ B, LDB, X, LDX, FERR, BERR, WORK, RWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. +* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 120 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 140 J = 1, NRHS + DO 130 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 130 CONTINUE + 140 CONTINUE + DO 150 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 150 CONTINUE + END IF +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.DLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 +* + RWORK( 1 ) = RPVGRW + RETURN +* +* End of ZGBSVX +* + END diff --git a/lapack-netlib/zgejsv.f b/lapack-netlib/zgejsv.f new file mode 100644 index 000000000..5fe899e50 --- /dev/null +++ b/lapack-netlib/zgejsv.f @@ -0,0 +1,2234 @@ +*> \brief \b ZGEJSV +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZGEJSV + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, +* M, N, A, LDA, SVA, U, LDU, V, LDV, +* CWORK, LWORK, RWORK, LRWORK, IWORK, INFO ) +* +* .. Scalar Arguments .. +* IMPLICIT NONE +* INTEGER INFO, LDA, LDU, LDV, LWORK, M, N +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), U( LDU, * ), V( LDV, * ), CWORK( LWORK ) +* DOUBLE PRECISION SVA( N ), RWORK( LRWORK ) +* INTEGER IWORK( * ) +* CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGEJSV computes the singular value decomposition (SVD) of a complex M-by-N +*> matrix [A], where M >= N. The SVD of [A] is written as +*> +*> [A] = [U] * [SIGMA] * [V]^*, +*> +*> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N +*> diagonal elements, [U] is an M-by-N (or M-by-M) unitary matrix, and +*> [V] is an N-by-N unitary matrix. The diagonal elements of [SIGMA] are +*> the singular values of [A]. The columns of [U] and [V] are the left and +*> the right singular vectors of [A], respectively. The matrices [U] and [V] +*> are computed and stored in the arrays U and V, respectively. The diagonal +*> of [SIGMA] is computed and stored in the array SVA. +*> \endverbatim +*> +*> Arguments: +*> ========== +*> +*> \param[in] JOBA +*> \verbatim +*> JOBA is CHARACTER*1 +*> Specifies the level of accuracy: +*> = 'C': This option works well (high relative accuracy) if A = B * D, +*> with well-conditioned B and arbitrary diagonal matrix D. +*> The accuracy cannot be spoiled by COLUMN scaling. The +*> accuracy of the computed output depends on the condition of +*> B, and the procedure aims at the best theoretical accuracy. +*> The relative error max_{i=1:N}|d sigma_i| / sigma_i is +*> bounded by f(M,N)*epsilon* cond(B), independent of D. +*> The input matrix is preprocessed with the QRF with column +*> pivoting. This initial preprocessing and preconditioning by +*> a rank revealing QR factorization is common for all values of +*> JOBA. Additional actions are specified as follows: +*> = 'E': Computation as with 'C' with an additional estimate of the +*> condition number of B. It provides a realistic error bound. +*> = 'F': If A = D1 * C * D2 with ill-conditioned diagonal scalings +*> D1, D2, and well-conditioned matrix C, this option gives +*> higher accuracy than the 'C' option. If the structure of the +*> input matrix is not known, and relative accuracy is +*> desirable, then this option is advisable. The input matrix A +*> is preprocessed with QR factorization with FULL (row and +*> column) pivoting. +*> = 'G': Computation as with 'F' with an additional estimate of the +*> condition number of B, where A=B*D. If A has heavily weighted +*> rows, then using this condition number gives too pessimistic +*> error bound. +*> = 'A': Small singular values are not well determined by the data +*> and are considered as noisy; the matrix is treated as +*> numerically rank deficient. The error in the computed +*> singular values is bounded by f(m,n)*epsilon*||A||. +*> The computed SVD A = U * S * V^* restores A up to +*> f(m,n)*epsilon*||A||. +*> This gives the procedure the licence to discard (set to zero) +*> all singular values below N*epsilon*||A||. +*> = 'R': Similar as in 'A'. Rank revealing property of the initial +*> QR factorization is used do reveal (using triangular factor) +*> a gap sigma_{r+1} < epsilon * sigma_r in which case the +*> numerical RANK is declared to be r. The SVD is computed with +*> absolute error bounds, but more accurately than with 'A'. +*> \endverbatim +*> +*> \param[in] JOBU +*> \verbatim +*> JOBU is CHARACTER*1 +*> Specifies whether to compute the columns of U: +*> = 'U': N columns of U are returned in the array U. +*> = 'F': full set of M left sing. vectors is returned in the array U. +*> = 'W': U may be used as workspace of length M*N. See the description +*> of U. +*> = 'N': U is not computed. +*> \endverbatim +*> +*> \param[in] JOBV +*> \verbatim +*> JOBV is CHARACTER*1 +*> Specifies whether to compute the matrix V: +*> = 'V': N columns of V are returned in the array V; Jacobi rotations +*> are not explicitly accumulated. +*> = 'J': N columns of V are returned in the array V, but they are +*> computed as the product of Jacobi rotations, if JOBT = 'N'. +*> = 'W': V may be used as workspace of length N*N. See the description +*> of V. +*> = 'N': V is not computed. +*> \endverbatim +*> +*> \param[in] JOBR +*> \verbatim +*> JOBR is CHARACTER*1 +*> Specifies the RANGE for the singular values. Issues the licence to +*> set to zero small positive singular values if they are outside +*> specified range. If A .NE. 0 is scaled so that the largest singular +*> value of c*A is around SQRT(BIG), BIG=DLAMCH('O'), then JOBR issues +*> the licence to kill columns of A whose norm in c*A is less than +*> SQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN, +*> where SFMIN=DLAMCH('S'), EPSLN=DLAMCH('E'). +*> = 'N': Do not kill small columns of c*A. This option assumes that +*> BLAS and QR factorizations and triangular solvers are +*> implemented to work in that range. If the condition of A +*> is greater than BIG, use ZGESVJ. +*> = 'R': RESTRICTED range for sigma(c*A) is [SQRT(SFMIN), SQRT(BIG)] +*> (roughly, as described above). This option is recommended. +*> =========================== +*> For computing the singular values in the FULL range [SFMIN,BIG] +*> use ZGESVJ. +*> \endverbatim +*> +*> \param[in] JOBT +*> \verbatim +*> JOBT is CHARACTER*1 +*> If the matrix is square then the procedure may determine to use +*> transposed A if A^* seems to be better with respect to convergence. +*> If the matrix is not square, JOBT is ignored. +*> The decision is based on two values of entropy over the adjoint +*> orbit of A^* * A. See the descriptions of RWORK(6) and RWORK(7). +*> = 'T': transpose if entropy test indicates possibly faster +*> convergence of Jacobi process if A^* is taken as input. If A is +*> replaced with A^*, then the row pivoting is included automatically. +*> = 'N': do not speculate. +*> The option 'T' can be used to compute only the singular values, or +*> the full SVD (U, SIGMA and V). For only one set of singular vectors +*> (U or V), the caller should provide both U and V, as one of the +*> matrices is used as workspace if the matrix A is transposed. +*> The implementer can easily remove this constraint and make the +*> code more complicated. See the descriptions of U and V. +*> In general, this option is considered experimental, and 'N'; should +*> be preferred. This is subject to changes in the future. +*> \endverbatim +*> +*> \param[in] JOBP +*> \verbatim +*> JOBP is CHARACTER*1 +*> Issues the licence to introduce structured perturbations to drown +*> denormalized numbers. This licence should be active if the +*> denormals are poorly implemented, causing slow computation, +*> especially in cases of fast convergence (!). For details see [1,2]. +*> For the sake of simplicity, this perturbations are included only +*> when the full SVD or only the singular values are requested. The +*> implementer/user can easily add the perturbation for the cases of +*> computing one set of singular vectors. +*> = 'P': introduce perturbation +*> = 'N': do not perturb +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the input matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the input matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] SVA +*> \verbatim +*> SVA is DOUBLE PRECISION array, dimension (N) +*> On exit, +*> - For RWORK(1)/RWORK(2) = ONE: The singular values of A. During +*> the computation SVA contains Euclidean column norms of the +*> iterated matrices in the array A. +*> - For RWORK(1) .NE. RWORK(2): The singular values of A are +*> (RWORK(1)/RWORK(2)) * SVA(1:N). This factored form is used if +*> sigma_max(A) overflows or if small singular values have been +*> saved from underflow by scaling the input matrix A. +*> - If JOBR='R' then some of the singular values may be returned +*> as exact zeros obtained by "set to zero" because they are +*> below the numerical rank threshold or are denormalized numbers. +*> \endverbatim +*> +*> \param[out] U +*> \verbatim +*> U is COMPLEX*16 array, dimension ( LDU, N ) +*> If JOBU = 'U', then U contains on exit the M-by-N matrix of +*> the left singular vectors. +*> If JOBU = 'F', then U contains on exit the M-by-M matrix of +*> the left singular vectors, including an ONB +*> of the orthogonal complement of the Range(A). +*> If JOBU = 'W' .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N), +*> then U is used as workspace if the procedure +*> replaces A with A^*. In that case, [V] is computed +*> in U as left singular vectors of A^* and then +*> copied back to the V array. This 'W' option is just +*> a reminder to the caller that in this case U is +*> reserved as workspace of length N*N. +*> If JOBU = 'N' U is not referenced, unless JOBT='T'. +*> \endverbatim +*> +*> \param[in] LDU +*> \verbatim +*> LDU is INTEGER +*> The leading dimension of the array U, LDU >= 1. +*> IF JOBU = 'U' or 'F' or 'W', then LDU >= M. +*> \endverbatim +*> +*> \param[out] V +*> \verbatim +*> V is COMPLEX*16 array, dimension ( LDV, N ) +*> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of +*> the right singular vectors; +*> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), +*> then V is used as workspace if the pprocedure +*> replaces A with A^*. In that case, [U] is computed +*> in V as right singular vectors of A^* and then +*> copied back to the U array. This 'W' option is just +*> a reminder to the caller that in this case V is +*> reserved as workspace of length N*N. +*> If JOBV = 'N' V is not referenced, unless JOBT='T'. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V, LDV >= 1. +*> If JOBV = 'V' or 'J' or 'W', then LDV >= N. +*> \endverbatim +*> +*> \param[out] CWORK +*> \verbatim +*> CWORK is COMPLEX*16 array, dimension (MAX(2,LWORK)) +*> If the call to ZGEJSV is a workspace query (indicated by LWORK=-1 or +*> LRWORK=-1), then on exit CWORK(1) contains the required length of +*> CWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> Length of CWORK to confirm proper allocation of workspace. +*> LWORK depends on the job: +*> +*> 1. If only SIGMA is needed ( JOBU = 'N', JOBV = 'N' ) and +*> 1.1 .. no scaled condition estimate required (JOBA.NE.'E'.AND.JOBA.NE.'G'): +*> LWORK >= 2*N+1. This is the minimal requirement. +*> ->> For optimal performance (blocked code) the optimal value +*> is LWORK >= N + (N+1)*NB. Here NB is the optimal +*> block size for ZGEQP3 and ZGEQRF. +*> In general, optimal LWORK is computed as +*> LWORK >= max(N+LWORK(ZGEQP3),N+LWORK(ZGEQRF), LWORK(ZGESVJ)). +*> 1.2. .. an estimate of the scaled condition number of A is +*> required (JOBA='E', or 'G'). In this case, LWORK the minimal +*> requirement is LWORK >= N*N + 2*N. +*> ->> For optimal performance (blocked code) the optimal value +*> is LWORK >= max(N+(N+1)*NB, N*N+2*N)=N**2+2*N. +*> In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(ZGEQP3),N+LWORK(ZGEQRF), LWORK(ZGESVJ), +*> N*N+LWORK(ZPOCON)). +*> 2. If SIGMA and the right singular vectors are needed (JOBV = 'V'), +*> (JOBU = 'N') +*> 2.1 .. no scaled condition estimate requested (JOBE = 'N'): +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance, +*> LWORK >= max(N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for ZGEQP3, ZGEQRF, ZGELQF, +*> ZUNMLQ. In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(ZGEQP3), N+LWORK(ZGESVJ), +*> N+LWORK(ZGELQF), 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMLQ)). +*> 2.2 .. an estimate of the scaled condition number of A is +*> required (JOBA='E', or 'G'). +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance, +*> LWORK >= max(N+(N+1)*NB, 2*N,2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for ZGEQP3, ZGEQRF, ZGELQF, +*> ZUNMLQ. In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(ZGEQP3), LWORK(ZPOCON), N+LWORK(ZGESVJ), +*> N+LWORK(ZGELQF), 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMLQ)). +*> 3. If SIGMA and the left singular vectors are needed +*> 3.1 .. no scaled condition estimate requested (JOBE = 'N'): +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance: +*> if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for ZGEQP3, ZGEQRF, ZUNMQR. +*> In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(ZGEQP3), 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMQR)). +*> 3.2 .. an estimate of the scaled condition number of A is +*> required (JOBA='E', or 'G'). +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance: +*> if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for ZGEQP3, ZGEQRF, ZUNMQR. +*> In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(ZGEQP3),N+LWORK(ZPOCON), +*> 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMQR)). +*> 4. If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and +*> 4.1. if JOBV = 'V' +*> the minimal requirement is LWORK >= 5*N+2*N*N. +*> 4.2. if JOBV = 'J' the minimal requirement is +*> LWORK >= 4*N+N*N. +*> In both cases, the allocated CWORK can accommodate blocked runs +*> of ZGEQP3, ZGEQRF, ZGELQF, SUNMQR, ZUNMLQ. +*> +*> If the call to ZGEJSV is a workspace query (indicated by LWORK=-1 or +*> LRWORK=-1), then on exit CWORK(1) contains the optimal and CWORK(2) contains the +*> minimal length of CWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, dimension (MAX(7,LRWORK)) +*> On exit, +*> RWORK(1) = Determines the scaling factor SCALE = RWORK(2) / RWORK(1) +*> such that SCALE*SVA(1:N) are the computed singular values +*> of A. (See the description of SVA().) +*> RWORK(2) = See the description of RWORK(1). +*> RWORK(3) = SCONDA is an estimate for the condition number of +*> column equilibrated A. (If JOBA = 'E' or 'G') +*> SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1). +*> It is computed using ZPOCON. It holds +*> N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA +*> where R is the triangular factor from the QRF of A. +*> However, if R is truncated and the numerical rank is +*> determined to be strictly smaller than N, SCONDA is +*> returned as -1, thus indicating that the smallest +*> singular values might be lost. +*> +*> If full SVD is needed, the following two condition numbers are +*> useful for the analysis of the algorithm. They are provided for +*> a developer/implementer who is familiar with the details of +*> the method. +*> +*> RWORK(4) = an estimate of the scaled condition number of the +*> triangular factor in the first QR factorization. +*> RWORK(5) = an estimate of the scaled condition number of the +*> triangular factor in the second QR factorization. +*> The following two parameters are computed if JOBT = 'T'. +*> They are provided for a developer/implementer who is familiar +*> with the details of the method. +*> RWORK(6) = the entropy of A^* * A :: this is the Shannon entropy +*> of diag(A^* * A) / Trace(A^* * A) taken as point in the +*> probability simplex. +*> RWORK(7) = the entropy of A * A^*. (See the description of RWORK(6).) +*> If the call to ZGEJSV is a workspace query (indicated by LWORK=-1 or +*> LRWORK=-1), then on exit RWORK(1) contains the required length of +*> RWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[in] LRWORK +*> \verbatim +*> LRWORK is INTEGER +*> Length of RWORK to confirm proper allocation of workspace. +*> LRWORK depends on the job: +*> +*> 1. If only the singular values are requested i.e. if +*> LSAME(JOBU,'N') .AND. LSAME(JOBV,'N') +*> then: +*> 1.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then: LRWORK = max( 7, 2 * M ). +*> 1.2. Otherwise, LRWORK = max( 7, N ). +*> 2. If singular values with the right singular vectors are requested +*> i.e. if +*> (LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) .AND. +*> .NOT.(LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) +*> then: +*> 2.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then LRWORK = max( 7, 2 * M ). +*> 2.2. Otherwise, LRWORK = max( 7, N ). +*> 3. If singular values with the left singular vectors are requested, i.e. if +*> (LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) .AND. +*> .NOT.(LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) +*> then: +*> 3.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then LRWORK = max( 7, 2 * M ). +*> 3.2. Otherwise, LRWORK = max( 7, N ). +*> 4. If singular values with both the left and the right singular vectors +*> are requested, i.e. if +*> (LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) .AND. +*> (LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) +*> then: +*> 4.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then LRWORK = max( 7, 2 * M ). +*> 4.2. Otherwise, LRWORK = max( 7, N ). +*> +*> If, on entry, LRWORK = -1 or LWORK=-1, a workspace query is assumed and +*> the length of RWORK is returned in RWORK(1). +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, of dimension at least 4, that further depends +*> on the job: +*> +*> 1. If only the singular values are requested then: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 2. If the singular values and the right singular vectors are requested then: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 3. If the singular values and the left singular vectors are requested then: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 4. If the singular values with both the left and the right singular vectors +*> are requested, then: +*> 4.1. If LSAME(JOBV,'J') the length of IWORK is determined as follows: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 4.2. If LSAME(JOBV,'V') the length of IWORK is determined as follows: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is 2*N+M; otherwise the length of IWORK is 2*N. +*> +*> On exit, +*> IWORK(1) = the numerical rank determined after the initial +*> QR factorization with pivoting. See the descriptions +*> of JOBA and JOBR. +*> IWORK(2) = the number of the computed nonzero singular values +*> IWORK(3) = if nonzero, a warning message: +*> If IWORK(3) = 1 then some of the column norms of A +*> were denormalized floats. The requested high accuracy +*> is not warranted by the data. +*> IWORK(4) = 1 or -1. If IWORK(4) = 1, then the procedure used A^* to +*> do the job as specified by the JOB parameters. +*> If the call to ZGEJSV is a workspace query (indicated by LWORK = -1 or +*> LRWORK = -1), then on exit IWORK(1) contains the required length of +*> IWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> < 0: if INFO = -i, then the i-th argument had an illegal value. +*> = 0: successful exit; +*> > 0: ZGEJSV did not converge in the maximal allowed number +*> of sweeps. The computed values may be inaccurate. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16GEsing +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> ZGEJSV implements a preconditioned Jacobi SVD algorithm. It uses ZGEQP3, +*> ZGEQRF, and ZGELQF as preprocessors and preconditioners. Optionally, an +*> additional row pivoting can be used as a preprocessor, which in some +*> cases results in much higher accuracy. An example is matrix A with the +*> structure A = D1 * C * D2, where D1, D2 are arbitrarily ill-conditioned +*> diagonal matrices and C is well-conditioned matrix. In that case, complete +*> pivoting in the first QR factorizations provides accuracy dependent on the +*> condition number of C, and independent of D1, D2. Such higher accuracy is +*> not completely understood theoretically, but it works well in practice. +*> Further, if A can be written as A = B*D, with well-conditioned B and some +*> diagonal D, then the high accuracy is guaranteed, both theoretically and +*> in software, independent of D. For more details see [1], [2]. +*> The computational range for the singular values can be the full range +*> ( UNDERFLOW,OVERFLOW ), provided that the machine arithmetic and the BLAS +*> & LAPACK routines called by ZGEJSV are implemented to work in that range. +*> If that is not the case, then the restriction for safe computation with +*> the singular values in the range of normalized IEEE numbers is that the +*> spectral condition number kappa(A)=sigma_max(A)/sigma_min(A) does not +*> overflow. This code (ZGEJSV) is best used in this restricted range, +*> meaning that singular values of magnitude below ||A||_2 / DLAMCH('O') are +*> returned as zeros. See JOBR for details on this. +*> Further, this implementation is somewhat slower than the one described +*> in [1,2] due to replacement of some non-LAPACK components, and because +*> the choice of some tuning parameters in the iterative part (ZGESVJ) is +*> left to the implementer on a particular machine. +*> The rank revealing QR factorization (in this code: ZGEQP3) should be +*> implemented as in [3]. We have a new version of ZGEQP3 under development +*> that is more robust than the current one in LAPACK, with a cleaner cut in +*> rank deficient cases. It will be available in the SIGMA library [4]. +*> If M is much larger than N, it is obvious that the initial QRF with +*> column pivoting can be preprocessed by the QRF without pivoting. That +*> well known trick is not used in ZGEJSV because in some cases heavy row +*> weighting can be treated with complete pivoting. The overhead in cases +*> M much larger than N is then only due to pivoting, but the benefits in +*> terms of accuracy have prevailed. The implementer/user can incorporate +*> this extra QRF step easily. The implementer can also improve data movement +*> (matrix transpose, matrix copy, matrix transposed copy) - this +*> implementation of ZGEJSV uses only the simplest, naive data movement. +*> \endverbatim +* +*> \par Contributor: +* ================== +*> +*> Zlatko Drmac, Department of Mathematics, Faculty of Science, +*> University of Zagreb (Zagreb, Croatia); drmac@math.hr +* +*> \par References: +* ================ +*> +*> \verbatim +*> +*> [1] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm I. +*> SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1322-1342. +*> LAPACK Working note 169. +*> [2] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm II. +*> SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1343-1362. +*> LAPACK Working note 170. +*> [3] Z. Drmac and Z. Bujanovic: On the failure of rank-revealing QR +*> factorization software - a case study. +*> ACM Trans. Math. Softw. Vol. 35, No 2 (2008), pp. 1-28. +*> LAPACK Working note 176. +*> [4] Z. Drmac: SIGMA - mathematical software library for accurate SVD, PSV, +*> QSVD, (H,K)-SVD computations. +*> Department of Mathematics, University of Zagreb, 2008, 2016. +*> \endverbatim +* +*> \par Bugs, examples and comments: +* ================================= +*> +*> Please report all bugs and send interesting examples and/or comments to +*> drmac@math.hr. Thank you. +*> +* ===================================================================== + SUBROUTINE ZGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, + $ M, N, A, LDA, SVA, U, LDU, V, LDV, + $ CWORK, LWORK, RWORK, LRWORK, IWORK, INFO ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + IMPLICIT NONE + INTEGER INFO, LDA, LDU, LDV, LWORK, LRWORK, M, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), U( LDU, * ), V( LDV, * ), + $ CWORK( LWORK ) + DOUBLE PRECISION SVA( N ), RWORK( LRWORK ) + INTEGER IWORK( * ) + CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV +* .. +* +* =========================================================================== +* +* .. Local Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + COMPLEX*16 CZERO, CONE + PARAMETER ( CZERO = ( 0.0D0, 0.0D0 ), CONE = ( 1.0D0, 0.0D0 ) ) +* .. +* .. Local Scalars .. + COMPLEX*16 CTEMP + DOUBLE PRECISION AAPP, AAQQ, AATMAX, AATMIN, BIG, BIG1, + $ COND_OK, CONDR1, CONDR2, ENTRA, ENTRAT, EPSLN, + $ MAXPRJ, SCALEM, SCONDA, SFMIN, SMALL, TEMP1, + $ USCAL1, USCAL2, XSC + INTEGER IERR, N1, NR, NUMRANK, p, q, WARNING + LOGICAL ALMORT, DEFR, ERREST, GOSCAL, JRACC, KILL, LQUERY, + $ LSVEC, L2ABER, L2KILL, L2PERT, L2RANK, L2TRAN, NOSCAL, + $ ROWPIV, RSVEC, TRANSP +* + INTEGER OPTWRK, MINWRK, MINRWRK, MINIWRK + INTEGER LWCON, LWLQF, LWQP3, LWQRF, LWUNMLQ, LWUNMQR, LWUNMQRM, + $ LWSVDJ, LWSVDJV, LRWQP3, LRWCON, LRWSVDJ, IWOFF + INTEGER LWRK_ZGELQF, LWRK_ZGEQP3, LWRK_ZGEQP3N, LWRK_ZGEQRF, + $ LWRK_ZGESVJ, LWRK_ZGESVJV, LWRK_ZGESVJU, LWRK_ZUNMLQ, + $ LWRK_ZUNMQR, LWRK_ZUNMQRM +* .. +* .. Local Arrays + COMPLEX*16 CDUMMY(1) + DOUBLE PRECISION RDUMMY(1) +* +* .. Intrinsic Functions .. + INTRINSIC ABS, DCMPLX, CONJG, DLOG, MAX, MIN, DBLE, NINT, SQRT +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, DZNRM2 + INTEGER IDAMAX, IZAMAX + LOGICAL LSAME + EXTERNAL IDAMAX, IZAMAX, LSAME, DLAMCH, DZNRM2 +* .. +* .. External Subroutines .. + EXTERNAL DLASSQ, ZCOPY, ZGELQF, ZGEQP3, ZGEQRF, ZLACPY, ZLAPMR, + $ ZLASCL, DLASCL, ZLASET, ZLASSQ, ZLASWP, ZUNGQR, ZUNMLQ, + $ ZUNMQR, ZPOCON, DSCAL, ZDSCAL, ZSWAP, ZTRSM, ZLACGV, + $ XERBLA +* + EXTERNAL ZGESVJ +* .. +* +* Test the input arguments +* + LSVEC = LSAME( JOBU, 'U' ) .OR. LSAME( JOBU, 'F' ) + JRACC = LSAME( JOBV, 'J' ) + RSVEC = LSAME( JOBV, 'V' ) .OR. JRACC + ROWPIV = LSAME( JOBA, 'F' ) .OR. LSAME( JOBA, 'G' ) + L2RANK = LSAME( JOBA, 'R' ) + L2ABER = LSAME( JOBA, 'A' ) + ERREST = LSAME( JOBA, 'E' ) .OR. LSAME( JOBA, 'G' ) + L2TRAN = LSAME( JOBT, 'T' ) .AND. ( M .EQ. N ) + L2KILL = LSAME( JOBR, 'R' ) + DEFR = LSAME( JOBR, 'N' ) + L2PERT = LSAME( JOBP, 'P' ) +* + LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 ) +* + IF ( .NOT.(ROWPIV .OR. L2RANK .OR. L2ABER .OR. + $ ERREST .OR. LSAME( JOBA, 'C' ) )) THEN + INFO = - 1 + ELSE IF ( .NOT.( LSVEC .OR. LSAME( JOBU, 'N' ) .OR. + $ ( LSAME( JOBU, 'W' ) .AND. RSVEC .AND. L2TRAN ) ) ) THEN + INFO = - 2 + ELSE IF ( .NOT.( RSVEC .OR. LSAME( JOBV, 'N' ) .OR. + $ ( LSAME( JOBV, 'W' ) .AND. LSVEC .AND. L2TRAN ) ) ) THEN + INFO = - 3 + ELSE IF ( .NOT. ( L2KILL .OR. DEFR ) ) THEN + INFO = - 4 + ELSE IF ( .NOT. ( LSAME(JOBT,'T') .OR. LSAME(JOBT,'N') ) ) THEN + INFO = - 5 + ELSE IF ( .NOT. ( L2PERT .OR. LSAME( JOBP, 'N' ) ) ) THEN + INFO = - 6 + ELSE IF ( M .LT. 0 ) THEN + INFO = - 7 + ELSE IF ( ( N .LT. 0 ) .OR. ( N .GT. M ) ) THEN + INFO = - 8 + ELSE IF ( LDA .LT. M ) THEN + INFO = - 10 + ELSE IF ( LSVEC .AND. ( LDU .LT. M ) ) THEN + INFO = - 13 + ELSE IF ( RSVEC .AND. ( LDV .LT. N ) ) THEN + INFO = - 15 + ELSE +* #:) + INFO = 0 + END IF +* + IF ( INFO .EQ. 0 ) THEN +* .. compute the minimal and the optimal workspace lengths +* [[The expressions for computing the minimal and the optimal +* values of LCWORK, LRWORK are written with a lot of redundancy and +* can be simplified. However, this verbose form is useful for +* maintenance and modifications of the code.]] +* +* .. minimal workspace length for ZGEQP3 of an M x N matrix, +* ZGEQRF of an N x N matrix, ZGELQF of an N x N matrix, +* ZUNMLQ for computing N x N matrix, ZUNMQR for computing N x N +* matrix, ZUNMQR for computing M x N matrix, respectively. + LWQP3 = N+1 + LWQRF = MAX( 1, N ) + LWLQF = MAX( 1, N ) + LWUNMLQ = MAX( 1, N ) + LWUNMQR = MAX( 1, N ) + LWUNMQRM = MAX( 1, M ) +* .. minimal workspace length for ZPOCON of an N x N matrix + LWCON = 2 * N +* .. minimal workspace length for ZGESVJ of an N x N matrix, +* without and with explicit accumulation of Jacobi rotations + LWSVDJ = MAX( 2 * N, 1 ) + LWSVDJV = MAX( 2 * N, 1 ) +* .. minimal REAL workspace length for ZGEQP3, ZPOCON, ZGESVJ + LRWQP3 = 2 * N + LRWCON = N + LRWSVDJ = N + IF ( LQUERY ) THEN + CALL ZGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1, + $ RDUMMY, IERR ) + LWRK_ZGEQP3 = INT( CDUMMY(1) ) + CALL ZGEQRF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) + LWRK_ZGEQRF = INT( CDUMMY(1) ) + CALL ZGELQF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) + LWRK_ZGELQF = INT( CDUMMY(1) ) + END IF + MINWRK = 2 + OPTWRK = 2 + MINIWRK = N + IF ( .NOT. (LSVEC .OR. RSVEC ) ) THEN +* .. minimal and optimal sizes of the complex workspace if +* only the singular values are requested + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, N**2+LWCON, N+LWQRF, LWSVDJ ) + ELSE + MINWRK = MAX( N+LWQP3, N+LWQRF, LWSVDJ ) + END IF + IF ( LQUERY ) THEN + CALL ZGESVJ( 'L', 'N', 'N', N, N, A, LDA, SVA, N, V, + $ LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_ZGEQP3, N**2+LWCON, + $ N+LWRK_ZGEQRF, LWRK_ZGESVJ ) + ELSE + OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWRK_ZGEQRF, + $ LWRK_ZGESVJ ) + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + IF ( ERREST ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWCON, LRWSVDJ ) + ELSE + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ ) + END IF + ELSE + IF ( ERREST ) THEN + MINRWRK = MAX( 7, LRWQP3, LRWCON, LRWSVDJ ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) + END IF + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE IF ( RSVEC .AND. (.NOT.LSVEC) ) THEN +* .. minimal and optimal sizes of the complex workspace if the +* singular values and the right singular vectors are requested + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, LWCON, LWSVDJ, N+LWLQF, + $ 2*N+LWQRF, N+LWSVDJ, N+LWUNMLQ ) + ELSE + MINWRK = MAX( N+LWQP3, LWSVDJ, N+LWLQF, 2*N+LWQRF, + $ N+LWSVDJ, N+LWUNMLQ ) + END IF + IF ( LQUERY ) THEN + CALL ZGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, + $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) + CALL ZUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, + $ V, LDV, CDUMMY, -1, IERR ) + LWRK_ZUNMLQ = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_ZGEQP3, LWCON, LWRK_ZGESVJ, + $ N+LWRK_ZGELQF, 2*N+LWRK_ZGEQRF, + $ N+LWRK_ZGESVJ, N+LWRK_ZUNMLQ ) + ELSE + OPTWRK = MAX( N+LWRK_ZGEQP3, LWRK_ZGESVJ,N+LWRK_ZGELQF, + $ 2*N+LWRK_ZGEQRF, N+LWRK_ZGESVJ, + $ N+LWRK_ZUNMLQ ) + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + IF ( ERREST ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ ) + END IF + ELSE + IF ( ERREST ) THEN + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) + END IF + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE IF ( LSVEC .AND. (.NOT.RSVEC) ) THEN +* .. minimal and optimal sizes of the complex workspace if the +* singular values and the left singular vectors are requested + IF ( ERREST ) THEN + MINWRK = N + MAX( LWQP3,LWCON,N+LWQRF,LWSVDJ,LWUNMQRM ) + ELSE + MINWRK = N + MAX( LWQP3, N+LWQRF, LWSVDJ, LWUNMQRM ) + END IF + IF ( LQUERY ) THEN + CALL ZGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, + $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) + CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_ZUNMQRM = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = N + MAX( LWRK_ZGEQP3, LWCON, N+LWRK_ZGEQRF, + $ LWRK_ZGESVJ, LWRK_ZUNMQRM ) + ELSE + OPTWRK = N + MAX( LWRK_ZGEQP3, N+LWRK_ZGEQRF, + $ LWRK_ZGESVJ, LWRK_ZUNMQRM ) + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + IF ( ERREST ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ ) + END IF + ELSE + IF ( ERREST ) THEN + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) + END IF + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE +* .. minimal and optimal sizes of the complex workspace if the +* full SVD is requested + IF ( .NOT. JRACC ) THEN + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, N+LWCON, 2*N+N**2+LWCON, + $ 2*N+LWQRF, 2*N+LWQP3, + $ 2*N+N**2+N+LWLQF, 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWSVDJ, 2*N+N**2+N+LWSVDJV, + $ 2*N+N**2+N+LWUNMQR,2*N+N**2+N+LWUNMLQ, + $ N+N**2+LWSVDJ, N+LWUNMQRM ) + ELSE + MINWRK = MAX( N+LWQP3, 2*N+N**2+LWCON, + $ 2*N+LWQRF, 2*N+LWQP3, + $ 2*N+N**2+N+LWLQF, 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWSVDJ, 2*N+N**2+N+LWSVDJV, + $ 2*N+N**2+N+LWUNMQR,2*N+N**2+N+LWUNMLQ, + $ N+N**2+LWSVDJ, N+LWUNMQRM ) + END IF + MINIWRK = MINIWRK + N + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, N+LWCON, 2*N+LWQRF, + $ 2*N+N**2+LWSVDJV, 2*N+N**2+N+LWUNMQR, + $ N+LWUNMQRM ) + ELSE + MINWRK = MAX( N+LWQP3, 2*N+LWQRF, + $ 2*N+N**2+LWSVDJV, 2*N+N**2+N+LWUNMQR, + $ N+LWUNMQRM ) + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + END IF + IF ( LQUERY ) THEN + CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_ZUNMQRM = INT( CDUMMY(1) ) + CALL ZUNMQR( 'L', 'N', N, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_ZUNMQR = INT( CDUMMY(1) ) + IF ( .NOT. JRACC ) THEN + CALL ZGEQP3( N,N, A, LDA, IWORK, CDUMMY,CDUMMY, -1, + $ RDUMMY, IERR ) + LWRK_ZGEQP3N = INT( CDUMMY(1) ) + CALL ZGESVJ( 'L', 'U', 'N', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) + CALL ZGESVJ( 'U', 'U', 'N', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_ZGESVJU = INT( CDUMMY(1) ) + CALL ZGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_ZGESVJV = INT( CDUMMY(1) ) + CALL ZUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, + $ V, LDV, CDUMMY, -1, IERR ) + LWRK_ZUNMLQ = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWCON, + $ 2*N+N**2+LWCON, 2*N+LWRK_ZGEQRF, + $ 2*N+LWRK_ZGEQP3N, + $ 2*N+N**2+N+LWRK_ZGELQF, + $ 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWRK_ZGESVJ, + $ 2*N+N**2+N+LWRK_ZGESVJV, + $ 2*N+N**2+N+LWRK_ZUNMQR, + $ 2*N+N**2+N+LWRK_ZUNMLQ, + $ N+N**2+LWRK_ZGESVJU, + $ N+LWRK_ZUNMQRM ) + ELSE + OPTWRK = MAX( N+LWRK_ZGEQP3, + $ 2*N+N**2+LWCON, 2*N+LWRK_ZGEQRF, + $ 2*N+LWRK_ZGEQP3N, + $ 2*N+N**2+N+LWRK_ZGELQF, + $ 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWRK_ZGESVJ, + $ 2*N+N**2+N+LWRK_ZGESVJV, + $ 2*N+N**2+N+LWRK_ZUNMQR, + $ 2*N+N**2+N+LWRK_ZUNMLQ, + $ N+N**2+LWRK_ZGESVJU, + $ N+LWRK_ZUNMQRM ) + END IF + ELSE + CALL ZGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_ZGESVJV = INT( CDUMMY(1) ) + CALL ZUNMQR( 'L', 'N', N, N, N, CDUMMY, N, CDUMMY, + $ V, LDV, CDUMMY, -1, IERR ) + LWRK_ZUNMQR = INT( CDUMMY(1) ) + CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_ZUNMQRM = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWCON, + $ 2*N+LWRK_ZGEQRF, 2*N+N**2, + $ 2*N+N**2+LWRK_ZGESVJV, + $ 2*N+N**2+N+LWRK_ZUNMQR,N+LWRK_ZUNMQRM ) + ELSE + OPTWRK = MAX( N+LWRK_ZGEQP3, 2*N+LWRK_ZGEQRF, + $ 2*N+N**2, 2*N+N**2+LWRK_ZGESVJV, + $ 2*N+N**2+N+LWRK_ZUNMQR, + $ N+LWRK_ZUNMQRM ) + END IF + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON ) + END IF + END IF + MINWRK = MAX( 2, MINWRK ) + OPTWRK = MAX( MINWRK, OPTWRK ) + IF ( LWORK .LT. MINWRK .AND. (.NOT.LQUERY) ) INFO = - 17 + IF ( LRWORK .LT. MINRWRK .AND. (.NOT.LQUERY) ) INFO = - 19 + END IF +* + IF ( INFO .NE. 0 ) THEN +* #:( + CALL XERBLA( 'ZGEJSV', - INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + CWORK(1) = OPTWRK + CWORK(2) = MINWRK + RWORK(1) = MINRWRK + IWORK(1) = MAX( 4, MINIWRK ) + RETURN + END IF +* +* Quick return for void matrix (Y3K safe) +* #:) + IF ( ( M .EQ. 0 ) .OR. ( N .EQ. 0 ) ) THEN + IWORK(1:4) = 0 + RWORK(1:7) = 0 + RETURN + ENDIF +* +* Determine whether the matrix U should be M x N or M x M +* + IF ( LSVEC ) THEN + N1 = N + IF ( LSAME( JOBU, 'F' ) ) N1 = M + END IF +* +* Set numerical parameters +* +*! NOTE: Make sure DLAMCH() does not fail on the target architecture. +* + EPSLN = DLAMCH('Epsilon') + SFMIN = DLAMCH('SafeMinimum') + SMALL = SFMIN / EPSLN + BIG = DLAMCH('O') +* BIG = ONE / SFMIN +* +* Initialize SVA(1:N) = diag( ||A e_i||_2 )_1^N +* +*(!) If necessary, scale SVA() to protect the largest norm from +* overflow. It is possible that this scaling pushes the smallest +* column norm left from the underflow threshold (extreme case). +* + SCALEM = ONE / SQRT(DBLE(M)*DBLE(N)) + NOSCAL = .TRUE. + GOSCAL = .TRUE. + DO 1874 p = 1, N + AAPP = ZERO + AAQQ = ONE + CALL ZLASSQ( M, A(1,p), 1, AAPP, AAQQ ) + IF ( AAPP .GT. BIG ) THEN + INFO = - 9 + CALL XERBLA( 'ZGEJSV', -INFO ) + RETURN + END IF + AAQQ = SQRT(AAQQ) + IF ( ( AAPP .LT. (BIG / AAQQ) ) .AND. NOSCAL ) THEN + SVA(p) = AAPP * AAQQ + ELSE + NOSCAL = .FALSE. + SVA(p) = AAPP * ( AAQQ * SCALEM ) + IF ( GOSCAL ) THEN + GOSCAL = .FALSE. + CALL DSCAL( p-1, SCALEM, SVA, 1 ) + END IF + END IF + 1874 CONTINUE +* + IF ( NOSCAL ) SCALEM = ONE +* + AAPP = ZERO + AAQQ = BIG + DO 4781 p = 1, N + AAPP = MAX( AAPP, SVA(p) ) + IF ( SVA(p) .NE. ZERO ) AAQQ = MIN( AAQQ, SVA(p) ) + 4781 CONTINUE +* +* Quick return for zero M x N matrix +* #:) + IF ( AAPP .EQ. ZERO ) THEN + IF ( LSVEC ) CALL ZLASET( 'G', M, N1, CZERO, CONE, U, LDU ) + IF ( RSVEC ) CALL ZLASET( 'G', N, N, CZERO, CONE, V, LDV ) + RWORK(1) = ONE + RWORK(2) = ONE + IF ( ERREST ) RWORK(3) = ONE + IF ( LSVEC .AND. RSVEC ) THEN + RWORK(4) = ONE + RWORK(5) = ONE + END IF + IF ( L2TRAN ) THEN + RWORK(6) = ZERO + RWORK(7) = ZERO + END IF + IWORK(1) = 0 + IWORK(2) = 0 + IWORK(3) = 0 + IWORK(4) = -1 + RETURN + END IF +* +* Issue warning if denormalized column norms detected. Override the +* high relative accuracy request. Issue licence to kill nonzero columns +* (set them to zero) whose norm is less than sigma_max / BIG (roughly). +* #:( + WARNING = 0 + IF ( AAQQ .LE. SFMIN ) THEN + L2RANK = .TRUE. + L2KILL = .TRUE. + WARNING = 1 + END IF +* +* Quick return for one-column matrix +* #:) + IF ( N .EQ. 1 ) THEN +* + IF ( LSVEC ) THEN + CALL ZLASCL( 'G',0,0,SVA(1),SCALEM, M,1,A(1,1),LDA,IERR ) + CALL ZLACPY( 'A', M, 1, A, LDA, U, LDU ) +* computing all M left singular vectors of the M x 1 matrix + IF ( N1 .NE. N ) THEN + CALL ZGEQRF( M, N, U,LDU, CWORK, CWORK(N+1),LWORK-N,IERR ) + CALL ZUNGQR( M,N1,1, U,LDU,CWORK,CWORK(N+1),LWORK-N,IERR ) + CALL ZCOPY( M, A(1,1), 1, U(1,1), 1 ) + END IF + END IF + IF ( RSVEC ) THEN + V(1,1) = CONE + END IF + IF ( SVA(1) .LT. (BIG*SCALEM) ) THEN + SVA(1) = SVA(1) / SCALEM + SCALEM = ONE + END IF + RWORK(1) = ONE / SCALEM + RWORK(2) = ONE + IF ( SVA(1) .NE. ZERO ) THEN + IWORK(1) = 1 + IF ( ( SVA(1) / SCALEM) .GE. SFMIN ) THEN + IWORK(2) = 1 + ELSE + IWORK(2) = 0 + END IF + ELSE + IWORK(1) = 0 + IWORK(2) = 0 + END IF + IWORK(3) = 0 + IWORK(4) = -1 + IF ( ERREST ) RWORK(3) = ONE + IF ( LSVEC .AND. RSVEC ) THEN + RWORK(4) = ONE + RWORK(5) = ONE + END IF + IF ( L2TRAN ) THEN + RWORK(6) = ZERO + RWORK(7) = ZERO + END IF + RETURN +* + END IF +* + TRANSP = .FALSE. +* + AATMAX = -ONE + AATMIN = BIG + IF ( ROWPIV .OR. L2TRAN ) THEN +* +* Compute the row norms, needed to determine row pivoting sequence +* (in the case of heavily row weighted A, row pivoting is strongly +* advised) and to collect information needed to compare the +* structures of A * A^* and A^* * A (in the case L2TRAN.EQ..TRUE.). +* + IF ( L2TRAN ) THEN + DO 1950 p = 1, M + XSC = ZERO + TEMP1 = ONE + CALL ZLASSQ( N, A(p,1), LDA, XSC, TEMP1 ) +* ZLASSQ gets both the ell_2 and the ell_infinity norm +* in one pass through the vector + RWORK(M+p) = XSC * SCALEM + RWORK(p) = XSC * (SCALEM*SQRT(TEMP1)) + AATMAX = MAX( AATMAX, RWORK(p) ) + IF (RWORK(p) .NE. ZERO) + $ AATMIN = MIN(AATMIN,RWORK(p)) + 1950 CONTINUE + ELSE + DO 1904 p = 1, M + RWORK(M+p) = SCALEM*ABS( A(p,IZAMAX(N,A(p,1),LDA)) ) + AATMAX = MAX( AATMAX, RWORK(M+p) ) + AATMIN = MIN( AATMIN, RWORK(M+p) ) + 1904 CONTINUE + END IF +* + END IF +* +* For square matrix A try to determine whether A^* would be better +* input for the preconditioned Jacobi SVD, with faster convergence. +* The decision is based on an O(N) function of the vector of column +* and row norms of A, based on the Shannon entropy. This should give +* the right choice in most cases when the difference actually matters. +* It may fail and pick the slower converging side. +* + ENTRA = ZERO + ENTRAT = ZERO + IF ( L2TRAN ) THEN +* + XSC = ZERO + TEMP1 = ONE + CALL DLASSQ( N, SVA, 1, XSC, TEMP1 ) + TEMP1 = ONE / TEMP1 +* + ENTRA = ZERO + DO 1113 p = 1, N + BIG1 = ( ( SVA(p) / XSC )**2 ) * TEMP1 + IF ( BIG1 .NE. ZERO ) ENTRA = ENTRA + BIG1 * DLOG(BIG1) + 1113 CONTINUE + ENTRA = - ENTRA / DLOG(DBLE(N)) +* +* Now, SVA().^2/Trace(A^* * A) is a point in the probability simplex. +* It is derived from the diagonal of A^* * A. Do the same with the +* diagonal of A * A^*, compute the entropy of the corresponding +* probability distribution. Note that A * A^* and A^* * A have the +* same trace. +* + ENTRAT = ZERO + DO 1114 p = 1, M + BIG1 = ( ( RWORK(p) / XSC )**2 ) * TEMP1 + IF ( BIG1 .NE. ZERO ) ENTRAT = ENTRAT + BIG1 * DLOG(BIG1) + 1114 CONTINUE + ENTRAT = - ENTRAT / DLOG(DBLE(M)) +* +* Analyze the entropies and decide A or A^*. Smaller entropy +* usually means better input for the algorithm. +* + TRANSP = ( ENTRAT .LT. ENTRA ) +* +* If A^* is better than A, take the adjoint of A. This is allowed +* only for square matrices, M=N. + IF ( TRANSP ) THEN +* In an optimal implementation, this trivial transpose +* should be replaced with faster transpose. + DO 1115 p = 1, N - 1 + A(p,p) = CONJG(A(p,p)) + DO 1116 q = p + 1, N + CTEMP = CONJG(A(q,p)) + A(q,p) = CONJG(A(p,q)) + A(p,q) = CTEMP + 1116 CONTINUE + 1115 CONTINUE + A(N,N) = CONJG(A(N,N)) + DO 1117 p = 1, N + RWORK(M+p) = SVA(p) + SVA(p) = RWORK(p) +* previously computed row 2-norms are now column 2-norms +* of the transposed matrix + 1117 CONTINUE + TEMP1 = AAPP + AAPP = AATMAX + AATMAX = TEMP1 + TEMP1 = AAQQ + AAQQ = AATMIN + AATMIN = TEMP1 + KILL = LSVEC + LSVEC = RSVEC + RSVEC = KILL + IF ( LSVEC ) N1 = N +* + ROWPIV = .TRUE. + END IF +* + END IF +* END IF L2TRAN +* +* Scale the matrix so that its maximal singular value remains less +* than SQRT(BIG) -- the matrix is scaled so that its maximal column +* has Euclidean norm equal to SQRT(BIG/N). The only reason to keep +* SQRT(BIG) instead of BIG is the fact that ZGEJSV uses LAPACK and +* BLAS routines that, in some implementations, are not capable of +* working in the full interval [SFMIN,BIG] and that they may provoke +* overflows in the intermediate results. If the singular values spread +* from SFMIN to BIG, then ZGESVJ will compute them. So, in that case, +* one should use ZGESVJ instead of ZGEJSV. +* >> change in the April 2016 update: allow bigger range, i.e. the +* largest column is allowed up to BIG/N and ZGESVJ will do the rest. + BIG1 = SQRT( BIG ) + TEMP1 = SQRT( BIG / DBLE(N) ) +* TEMP1 = BIG/DBLE(N) +* + CALL DLASCL( 'G', 0, 0, AAPP, TEMP1, N, 1, SVA, N, IERR ) + IF ( AAQQ .GT. (AAPP * SFMIN) ) THEN + AAQQ = ( AAQQ / AAPP ) * TEMP1 + ELSE + AAQQ = ( AAQQ * TEMP1 ) / AAPP + END IF + TEMP1 = TEMP1 * SCALEM + CALL ZLASCL( 'G', 0, 0, AAPP, TEMP1, M, N, A, LDA, IERR ) +* +* To undo scaling at the end of this procedure, multiply the +* computed singular values with USCAL2 / USCAL1. +* + USCAL1 = TEMP1 + USCAL2 = AAPP +* + IF ( L2KILL ) THEN +* L2KILL enforces computation of nonzero singular values in +* the restricted range of condition number of the initial A, +* sigma_max(A) / sigma_min(A) approx. SQRT(BIG)/SQRT(SFMIN). + XSC = SQRT( SFMIN ) + ELSE + XSC = SMALL +* +* Now, if the condition number of A is too big, +* sigma_max(A) / sigma_min(A) .GT. SQRT(BIG/N) * EPSLN / SFMIN, +* as a precaution measure, the full SVD is computed using ZGESVJ +* with accumulated Jacobi rotations. This provides numerically +* more robust computation, at the cost of slightly increased run +* time. Depending on the concrete implementation of BLAS and LAPACK +* (i.e. how they behave in presence of extreme ill-conditioning) the +* implementor may decide to remove this switch. + IF ( ( AAQQ.LT.SQRT(SFMIN) ) .AND. LSVEC .AND. RSVEC ) THEN + JRACC = .TRUE. + END IF +* + END IF + IF ( AAQQ .LT. XSC ) THEN + DO 700 p = 1, N + IF ( SVA(p) .LT. XSC ) THEN + CALL ZLASET( 'A', M, 1, CZERO, CZERO, A(1,p), LDA ) + SVA(p) = ZERO + END IF + 700 CONTINUE + END IF +* +* Preconditioning using QR factorization with pivoting +* + IF ( ROWPIV ) THEN +* Optional row permutation (Bjoerck row pivoting): +* A result by Cox and Higham shows that the Bjoerck's +* row pivoting combined with standard column pivoting +* has similar effect as Powell-Reid complete pivoting. +* The ell-infinity norms of A are made nonincreasing. + IF ( ( LSVEC .AND. RSVEC ) .AND. .NOT.( JRACC ) ) THEN + IWOFF = 2*N + ELSE + IWOFF = N + END IF + DO 1952 p = 1, M - 1 + q = IDAMAX( M-p+1, RWORK(M+p), 1 ) + p - 1 + IWORK(IWOFF+p) = q + IF ( p .NE. q ) THEN + TEMP1 = RWORK(M+p) + RWORK(M+p) = RWORK(M+q) + RWORK(M+q) = TEMP1 + END IF + 1952 CONTINUE + CALL ZLASWP( N, A, LDA, 1, M-1, IWORK(IWOFF+1), 1 ) + END IF +* +* End of the preparation phase (scaling, optional sorting and +* transposing, optional flushing of small columns). +* +* Preconditioning +* +* If the full SVD is needed, the right singular vectors are computed +* from a matrix equation, and for that we need theoretical analysis +* of the Businger-Golub pivoting. So we use ZGEQP3 as the first RR QRF. +* In all other cases the first RR QRF can be chosen by other criteria +* (eg speed by replacing global with restricted window pivoting, such +* as in xGEQPX from TOMS # 782). Good results will be obtained using +* xGEQPX with properly (!) chosen numerical parameters. +* Any improvement of ZGEQP3 improves overall performance of ZGEJSV. +* +* A * P1 = Q1 * [ R1^* 0]^*: + DO 1963 p = 1, N +* .. all columns are free columns + IWORK(p) = 0 + 1963 CONTINUE + CALL ZGEQP3( M, N, A, LDA, IWORK, CWORK, CWORK(N+1), LWORK-N, + $ RWORK, IERR ) +* +* The upper triangular matrix R1 from the first QRF is inspected for +* rank deficiency and possibilities for deflation, or possible +* ill-conditioning. Depending on the user specified flag L2RANK, +* the procedure explores possibilities to reduce the numerical +* rank by inspecting the computed upper triangular factor. If +* L2RANK or L2ABER are up, then ZGEJSV will compute the SVD of +* A + dA, where ||dA|| <= f(M,N)*EPSLN. +* + NR = 1 + IF ( L2ABER ) THEN +* Standard absolute error bound suffices. All sigma_i with +* sigma_i < N*EPSLN*||A|| are flushed to zero. This is an +* aggressive enforcement of lower numerical rank by introducing a +* backward error of the order of N*EPSLN*||A||. + TEMP1 = SQRT(DBLE(N))*EPSLN + DO 3001 p = 2, N + IF ( ABS(A(p,p)) .GE. (TEMP1*ABS(A(1,1))) ) THEN + NR = NR + 1 + ELSE + GO TO 3002 + END IF + 3001 CONTINUE + 3002 CONTINUE + ELSE IF ( L2RANK ) THEN +* .. similarly as above, only slightly more gentle (less aggressive). +* Sudden drop on the diagonal of R1 is used as the criterion for +* close-to-rank-deficient. + TEMP1 = SQRT(SFMIN) + DO 3401 p = 2, N + IF ( ( ABS(A(p,p)) .LT. (EPSLN*ABS(A(p-1,p-1))) ) .OR. + $ ( ABS(A(p,p)) .LT. SMALL ) .OR. + $ ( L2KILL .AND. (ABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3402 + NR = NR + 1 + 3401 CONTINUE + 3402 CONTINUE +* + ELSE +* The goal is high relative accuracy. However, if the matrix +* has high scaled condition number the relative accuracy is in +* general not feasible. Later on, a condition number estimator +* will be deployed to estimate the scaled condition number. +* Here we just remove the underflowed part of the triangular +* factor. This prevents the situation in which the code is +* working hard to get the accuracy not warranted by the data. + TEMP1 = SQRT(SFMIN) + DO 3301 p = 2, N + IF ( ( ABS(A(p,p)) .LT. SMALL ) .OR. + $ ( L2KILL .AND. (ABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3302 + NR = NR + 1 + 3301 CONTINUE + 3302 CONTINUE +* + END IF +* + ALMORT = .FALSE. + IF ( NR .EQ. N ) THEN + MAXPRJ = ONE + DO 3051 p = 2, N + TEMP1 = ABS(A(p,p)) / SVA(IWORK(p)) + MAXPRJ = MIN( MAXPRJ, TEMP1 ) + 3051 CONTINUE + IF ( MAXPRJ**2 .GE. ONE - DBLE(N)*EPSLN ) ALMORT = .TRUE. + END IF +* +* + SCONDA = - ONE + CONDR1 = - ONE + CONDR2 = - ONE +* + IF ( ERREST ) THEN + IF ( N .EQ. NR ) THEN + IF ( RSVEC ) THEN +* .. V is available as workspace + CALL ZLACPY( 'U', N, N, A, LDA, V, LDV ) + DO 3053 p = 1, N + TEMP1 = SVA(IWORK(p)) + CALL ZDSCAL( p, ONE/TEMP1, V(1,p), 1 ) + 3053 CONTINUE + IF ( LSVEC )THEN + CALL ZPOCON( 'U', N, V, LDV, ONE, TEMP1, + $ CWORK(N+1), RWORK, IERR ) + ELSE + CALL ZPOCON( 'U', N, V, LDV, ONE, TEMP1, + $ CWORK, RWORK, IERR ) + END IF +* + ELSE IF ( LSVEC ) THEN +* .. U is available as workspace + CALL ZLACPY( 'U', N, N, A, LDA, U, LDU ) + DO 3054 p = 1, N + TEMP1 = SVA(IWORK(p)) + CALL ZDSCAL( p, ONE/TEMP1, U(1,p), 1 ) + 3054 CONTINUE + CALL ZPOCON( 'U', N, U, LDU, ONE, TEMP1, + $ CWORK(N+1), RWORK, IERR ) + ELSE + CALL ZLACPY( 'U', N, N, A, LDA, CWORK, N ) +*[] CALL ZLACPY( 'U', N, N, A, LDA, CWORK(N+1), N ) +* Change: here index shifted by N to the left, CWORK(1:N) +* not needed for SIGMA only computation + DO 3052 p = 1, N + TEMP1 = SVA(IWORK(p)) +*[] CALL ZDSCAL( p, ONE/TEMP1, CWORK(N+(p-1)*N+1), 1 ) + CALL ZDSCAL( p, ONE/TEMP1, CWORK((p-1)*N+1), 1 ) + 3052 CONTINUE +* .. the columns of R are scaled to have unit Euclidean lengths. +*[] CALL ZPOCON( 'U', N, CWORK(N+1), N, ONE, TEMP1, +*[] $ CWORK(N+N*N+1), RWORK, IERR ) + CALL ZPOCON( 'U', N, CWORK, N, ONE, TEMP1, + $ CWORK(N*N+1), RWORK, IERR ) +* + END IF + IF ( TEMP1 .NE. ZERO ) THEN + SCONDA = ONE / SQRT(TEMP1) + ELSE + SCONDA = - ONE + END IF +* SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1). +* N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA + ELSE + SCONDA = - ONE + END IF + END IF +* + L2PERT = L2PERT .AND. ( ABS( A(1,1)/A(NR,NR) ) .GT. SQRT(BIG1) ) +* If there is no violent scaling, artificial perturbation is not needed. +* +* Phase 3: +* + IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN +* +* Singular Values only +* +* .. transpose A(1:NR,1:N) + DO 1946 p = 1, MIN( N-1, NR ) + CALL ZCOPY( N-p, A(p,p+1), LDA, A(p+1,p), 1 ) + CALL ZLACGV( N-p+1, A(p,p), 1 ) + 1946 CONTINUE + IF ( NR .EQ. N ) A(N,N) = CONJG(A(N,N)) +* +* The following two DO-loops introduce small relative perturbation +* into the strict upper triangle of the lower triangular matrix. +* Small entries below the main diagonal are also changed. +* This modification is useful if the computing environment does not +* provide/allow FLUSH TO ZERO underflow, for it prevents many +* annoying denormalized numbers in case of strongly scaled matrices. +* The perturbation is structured so that it does not introduce any +* new perturbation of the singular values, and it does not destroy +* the job done by the preconditioner. +* The licence for this perturbation is in the variable L2PERT, which +* should be .FALSE. if FLUSH TO ZERO underflow is active. +* + IF ( .NOT. ALMORT ) THEN +* + IF ( L2PERT ) THEN +* XSC = SQRT(SMALL) + XSC = EPSLN / DBLE(N) + DO 4947 q = 1, NR + CTEMP = DCMPLX(XSC*ABS(A(q,q)),ZERO) + DO 4949 p = 1, N + IF ( ( (p.GT.q) .AND. (ABS(A(p,q)).LE.TEMP1) ) + $ .OR. ( p .LT. q ) ) +* $ A(p,q) = TEMP1 * ( A(p,q) / ABS(A(p,q)) ) + $ A(p,q) = CTEMP + 4949 CONTINUE + 4947 CONTINUE + ELSE + CALL ZLASET( 'U', NR-1,NR-1, CZERO,CZERO, A(1,2),LDA ) + END IF +* +* .. second preconditioning using the QR factorization +* + CALL ZGEQRF( N,NR, A,LDA, CWORK, CWORK(N+1),LWORK-N, IERR ) +* +* .. and transpose upper to lower triangular + DO 1948 p = 1, NR - 1 + CALL ZCOPY( NR-p, A(p,p+1), LDA, A(p+1,p), 1 ) + CALL ZLACGV( NR-p+1, A(p,p), 1 ) + 1948 CONTINUE +* + END IF +* +* Row-cyclic Jacobi SVD algorithm with column pivoting +* +* .. again some perturbation (a "background noise") is added +* to drown denormals + IF ( L2PERT ) THEN +* XSC = SQRT(SMALL) + XSC = EPSLN / DBLE(N) + DO 1947 q = 1, NR + CTEMP = DCMPLX(XSC*ABS(A(q,q)),ZERO) + DO 1949 p = 1, NR + IF ( ( (p.GT.q) .AND. (ABS(A(p,q)).LE.TEMP1) ) + $ .OR. ( p .LT. q ) ) +* $ A(p,q) = TEMP1 * ( A(p,q) / ABS(A(p,q)) ) + $ A(p,q) = CTEMP + 1949 CONTINUE + 1947 CONTINUE + ELSE + CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, A(1,2), LDA ) + END IF +* +* .. and one-sided Jacobi rotations are started on a lower +* triangular matrix (plus perturbation which is ignored in +* the part which destroys triangular form (confusing?!)) +* + CALL ZGESVJ( 'L', 'N', 'N', NR, NR, A, LDA, SVA, + $ N, V, LDV, CWORK, LWORK, RWORK, LRWORK, INFO ) +* + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) +* +* + ELSE IF ( ( RSVEC .AND. ( .NOT. LSVEC ) .AND. ( .NOT. JRACC ) ) + $ .OR. + $ ( JRACC .AND. ( .NOT. LSVEC ) .AND. ( NR .NE. N ) ) ) THEN +* +* -> Singular Values and Right Singular Vectors <- +* + IF ( ALMORT ) THEN +* +* .. in this case NR equals N + DO 1998 p = 1, NR + CALL ZCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + CALL ZLACGV( N-p+1, V(p,p), 1 ) + 1998 CONTINUE + CALL ZLASET( 'U', NR-1,NR-1, CZERO, CZERO, V(1,2), LDV ) +* + CALL ZGESVJ( 'L','U','N', N, NR, V, LDV, SVA, NR, A, LDA, + $ CWORK, LWORK, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + + ELSE +* +* .. two more QR factorizations ( one QRF is not enough, two require +* accumulated product of Jacobi rotations, three are perfect ) +* + CALL ZLASET( 'L', NR-1,NR-1, CZERO, CZERO, A(2,1), LDA ) + CALL ZGELQF( NR,N, A, LDA, CWORK, CWORK(N+1), LWORK-N, IERR) + CALL ZLACPY( 'L', NR, NR, A, LDA, V, LDV ) + CALL ZLASET( 'U', NR-1,NR-1, CZERO, CZERO, V(1,2), LDV ) + CALL ZGEQRF( NR, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) + DO 8998 p = 1, NR + CALL ZCOPY( NR-p+1, V(p,p), LDV, V(p,p), 1 ) + CALL ZLACGV( NR-p+1, V(p,p), 1 ) + 8998 CONTINUE + CALL ZLASET('U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV) +* + CALL ZGESVJ( 'L', 'U','N', NR, NR, V,LDV, SVA, NR, U, + $ LDU, CWORK(N+1), LWORK-N, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + IF ( NR .LT. N ) THEN + CALL ZLASET( 'A',N-NR, NR, CZERO,CZERO, V(NR+1,1), LDV ) + CALL ZLASET( 'A',NR, N-NR, CZERO,CZERO, V(1,NR+1), LDV ) + CALL ZLASET( 'A',N-NR,N-NR,CZERO,CONE, V(NR+1,NR+1),LDV ) + END IF +* + CALL ZUNMLQ( 'L', 'C', N, N, NR, A, LDA, CWORK, + $ V, LDV, CWORK(N+1), LWORK-N, IERR ) +* + END IF +* .. permute the rows of V +* DO 8991 p = 1, N +* CALL ZCOPY( N, V(p,1), LDV, A(IWORK(p),1), LDA ) +* 8991 CONTINUE +* CALL ZLACPY( 'All', N, N, A, LDA, V, LDV ) + CALL ZLAPMR( .FALSE., N, N, V, LDV, IWORK ) +* + IF ( TRANSP ) THEN + CALL ZLACPY( 'A', N, N, V, LDV, U, LDU ) + END IF +* + ELSE IF ( JRACC .AND. (.NOT. LSVEC) .AND. ( NR.EQ. N ) ) THEN +* + CALL ZLASET( 'L', N-1,N-1, CZERO, CZERO, A(2,1), LDA ) +* + CALL ZGESVJ( 'U','N','V', N, N, A, LDA, SVA, N, V, LDV, + $ CWORK, LWORK, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + CALL ZLAPMR( .FALSE., N, N, V, LDV, IWORK ) +* + ELSE IF ( LSVEC .AND. ( .NOT. RSVEC ) ) THEN +* +* .. Singular Values and Left Singular Vectors .. +* +* .. second preconditioning step to avoid need to accumulate +* Jacobi rotations in the Jacobi iterations. + DO 1965 p = 1, NR + CALL ZCOPY( N-p+1, A(p,p), LDA, U(p,p), 1 ) + CALL ZLACGV( N-p+1, U(p,p), 1 ) + 1965 CONTINUE + CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU ) +* + CALL ZGEQRF( N, NR, U, LDU, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) +* + DO 1967 p = 1, NR - 1 + CALL ZCOPY( NR-p, U(p,p+1), LDU, U(p+1,p), 1 ) + CALL ZLACGV( N-p+1, U(p,p), 1 ) + 1967 CONTINUE + CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU ) +* + CALL ZGESVJ( 'L', 'U', 'N', NR,NR, U, LDU, SVA, NR, A, + $ LDA, CWORK(N+1), LWORK-N, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) +* + IF ( NR .LT. M ) THEN + CALL ZLASET( 'A', M-NR, NR,CZERO, CZERO, U(NR+1,1), LDU ) + IF ( NR .LT. N1 ) THEN + CALL ZLASET( 'A',NR, N1-NR, CZERO, CZERO, U(1,NR+1),LDU ) + CALL ZLASET( 'A',M-NR,N1-NR,CZERO,CONE,U(NR+1,NR+1),LDU ) + END IF + END IF +* + CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) +* + IF ( ROWPIV ) + $ CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* + DO 1974 p = 1, N1 + XSC = ONE / DZNRM2( M, U(1,p), 1 ) + CALL ZDSCAL( M, XSC, U(1,p), 1 ) + 1974 CONTINUE +* + IF ( TRANSP ) THEN + CALL ZLACPY( 'A', N, N, U, LDU, V, LDV ) + END IF +* + ELSE +* +* .. Full SVD .. +* + IF ( .NOT. JRACC ) THEN +* + IF ( .NOT. ALMORT ) THEN +* +* Second Preconditioning Step (QRF [with pivoting]) +* Note that the composition of TRANSPOSE, QRF and TRANSPOSE is +* equivalent to an LQF CALL. Since in many libraries the QRF +* seems to be better optimized than the LQF, we do explicit +* transpose and use the QRF. This is subject to changes in an +* optimized implementation of ZGEJSV. +* + DO 1968 p = 1, NR + CALL ZCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + CALL ZLACGV( N-p+1, V(p,p), 1 ) + 1968 CONTINUE +* +* .. the following two loops perturb small entries to avoid +* denormals in the second QR factorization, where they are +* as good as zeros. This is done to avoid painfully slow +* computation with denormals. The relative size of the perturbation +* is a parameter that can be changed by the implementer. +* This perturbation device will be obsolete on machines with +* properly implemented arithmetic. +* To switch it off, set L2PERT=.FALSE. To remove it from the +* code, remove the action under L2PERT=.TRUE., leave the ELSE part. +* The following two loops should be blocked and fused with the +* transposed copy above. +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 2969 q = 1, NR + CTEMP = DCMPLX(XSC*ABS( V(q,q) ),ZERO) + DO 2968 p = 1, N + IF ( ( p .GT. q ) .AND. ( ABS(V(p,q)) .LE. TEMP1 ) + $ .OR. ( p .LT. q ) ) +* $ V(p,q) = TEMP1 * ( V(p,q) / ABS(V(p,q)) ) + $ V(p,q) = CTEMP + IF ( p .LT. q ) V(p,q) = - V(p,q) + 2968 CONTINUE + 2969 CONTINUE + ELSE + CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV ) + END IF +* +* Estimate the row scaled condition number of R1 +* (If R1 is rectangular, N > NR, then the condition number +* of the leading NR x NR submatrix is estimated.) +* + CALL ZLACPY( 'L', NR, NR, V, LDV, CWORK(2*N+1), NR ) + DO 3950 p = 1, NR + TEMP1 = DZNRM2(NR-p+1,CWORK(2*N+(p-1)*NR+p),1) + CALL ZDSCAL(NR-p+1,ONE/TEMP1,CWORK(2*N+(p-1)*NR+p),1) + 3950 CONTINUE + CALL ZPOCON('L',NR,CWORK(2*N+1),NR,ONE,TEMP1, + $ CWORK(2*N+NR*NR+1),RWORK,IERR) + CONDR1 = ONE / SQRT(TEMP1) +* .. here need a second opinion on the condition number +* .. then assume worst case scenario +* R1 is OK for inverse <=> CONDR1 .LT. DBLE(N) +* more conservative <=> CONDR1 .LT. SQRT(DBLE(N)) +* + COND_OK = SQRT(SQRT(DBLE(NR))) +*[TP] COND_OK is a tuning parameter. +* + IF ( CONDR1 .LT. COND_OK ) THEN +* .. the second QRF without pivoting. Note: in an optimized +* implementation, this QRF should be implemented as the QRF +* of a lower triangular matrix. +* R1^* = Q2 * R2 + CALL ZGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL)/EPSLN + DO 3959 p = 2, NR + DO 3958 q = 1, p - 1 + CTEMP=DCMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))), + $ ZERO) + IF ( ABS(V(q,p)) .LE. TEMP1 ) +* $ V(q,p) = TEMP1 * ( V(q,p) / ABS(V(q,p)) ) + $ V(q,p) = CTEMP + 3958 CONTINUE + 3959 CONTINUE + END IF +* + IF ( NR .NE. N ) + $ CALL ZLACPY( 'A', N, NR, V, LDV, CWORK(2*N+1), N ) +* .. save ... +* +* .. this transposed copy should be better than naive + DO 1969 p = 1, NR - 1 + CALL ZCOPY( NR-p, V(p,p+1), LDV, V(p+1,p), 1 ) + CALL ZLACGV(NR-p+1, V(p,p), 1 ) + 1969 CONTINUE + V(NR,NR)=CONJG(V(NR,NR)) +* + CONDR2 = CONDR1 +* + ELSE +* +* .. ill-conditioned case: second QRF with pivoting +* Note that windowed pivoting would be equally good +* numerically, and more run-time efficient. So, in +* an optimal implementation, the next call to ZGEQP3 +* should be replaced with eg. CALL ZGEQPX (ACM TOMS #782) +* with properly (carefully) chosen parameters. +* +* R1^* * P2 = Q2 * R2 + DO 3003 p = 1, NR + IWORK(N+p) = 0 + 3003 CONTINUE + CALL ZGEQP3( N, NR, V, LDV, IWORK(N+1), CWORK(N+1), + $ CWORK(2*N+1), LWORK-2*N, RWORK, IERR ) +** CALL ZGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), +** $ LWORK-2*N, IERR ) + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 3969 p = 2, NR + DO 3968 q = 1, p - 1 + CTEMP=DCMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))), + $ ZERO) + IF ( ABS(V(q,p)) .LE. TEMP1 ) +* $ V(q,p) = TEMP1 * ( V(q,p) / ABS(V(q,p)) ) + $ V(q,p) = CTEMP + 3968 CONTINUE + 3969 CONTINUE + END IF +* + CALL ZLACPY( 'A', N, NR, V, LDV, CWORK(2*N+1), N ) +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 8970 p = 2, NR + DO 8971 q = 1, p - 1 + CTEMP=DCMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))), + $ ZERO) +* V(p,q) = - TEMP1*( V(q,p) / ABS(V(q,p)) ) + V(p,q) = - CTEMP + 8971 CONTINUE + 8970 CONTINUE + ELSE + CALL ZLASET( 'L',NR-1,NR-1,CZERO,CZERO,V(2,1),LDV ) + END IF +* Now, compute R2 = L3 * Q3, the LQ factorization. + CALL ZGELQF( NR, NR, V, LDV, CWORK(2*N+N*NR+1), + $ CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, IERR ) +* .. and estimate the condition number + CALL ZLACPY( 'L',NR,NR,V,LDV,CWORK(2*N+N*NR+NR+1),NR ) + DO 4950 p = 1, NR + TEMP1 = DZNRM2( p, CWORK(2*N+N*NR+NR+p), NR ) + CALL ZDSCAL( p, ONE/TEMP1, CWORK(2*N+N*NR+NR+p), NR ) + 4950 CONTINUE + CALL ZPOCON( 'L',NR,CWORK(2*N+N*NR+NR+1),NR,ONE,TEMP1, + $ CWORK(2*N+N*NR+NR+NR*NR+1),RWORK,IERR ) + CONDR2 = ONE / SQRT(TEMP1) +* +* + IF ( CONDR2 .GE. COND_OK ) THEN +* .. save the Householder vectors used for Q3 +* (this overwrites the copy of R2, as it will not be +* needed in this branch, but it does not overwritte the +* Huseholder vectors of Q2.). + CALL ZLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N ) +* .. and the rest of the information on Q3 is in +* WORK(2*N+N*NR+1:2*N+N*NR+N) + END IF +* + END IF +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 4968 q = 2, NR + CTEMP = XSC * V(q,q) + DO 4969 p = 1, q - 1 +* V(p,q) = - TEMP1*( V(p,q) / ABS(V(p,q)) ) + V(p,q) = - CTEMP + 4969 CONTINUE + 4968 CONTINUE + ELSE + CALL ZLASET( 'U', NR-1,NR-1, CZERO,CZERO, V(1,2), LDV ) + END IF +* +* Second preconditioning finished; continue with Jacobi SVD +* The input matrix is lower trinagular. +* +* Recover the right singular vectors as solution of a well +* conditioned triangular matrix equation. +* + IF ( CONDR1 .LT. COND_OK ) THEN +* + CALL ZGESVJ( 'L','U','N',NR,NR,V,LDV,SVA,NR,U, LDU, + $ CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,RWORK, + $ LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + DO 3970 p = 1, NR + CALL ZCOPY( NR, V(1,p), 1, U(1,p), 1 ) + CALL ZDSCAL( NR, SVA(p), V(1,p), 1 ) + 3970 CONTINUE + +* .. pick the right matrix equation and solve it +* + IF ( NR .EQ. N ) THEN +* :)) .. best case, R1 is inverted. The solution of this matrix +* equation is Q2*V2 = the product of the Jacobi rotations +* used in ZGESVJ, premultiplied with the orthogonal matrix +* from the second QR factorization. + CALL ZTRSM('L','U','N','N', NR,NR,CONE, A,LDA, V,LDV) + ELSE +* .. R1 is well conditioned, but non-square. Adjoint of R2 +* is inverted to get the product of the Jacobi rotations +* used in ZGESVJ. The Q-factor from the second QR +* factorization is then built in explicitly. + CALL ZTRSM('L','U','C','N',NR,NR,CONE,CWORK(2*N+1), + $ N,V,LDV) + IF ( NR .LT. N ) THEN + CALL ZLASET('A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV) + CALL ZLASET('A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV) + CALL ZLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV) + END IF + CALL ZUNMQR('L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1), + $ V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR) + END IF +* + ELSE IF ( CONDR2 .LT. COND_OK ) THEN +* +* The matrix R2 is inverted. The solution of the matrix equation +* is Q3^* * V3 = the product of the Jacobi rotations (appplied to +* the lower triangular L3 from the LQ factorization of +* R2=L3*Q3), pre-multiplied with the transposed Q3. + CALL ZGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U, + $ LDU, CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, + $ RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + DO 3870 p = 1, NR + CALL ZCOPY( NR, V(1,p), 1, U(1,p), 1 ) + CALL ZDSCAL( NR, SVA(p), U(1,p), 1 ) + 3870 CONTINUE + CALL ZTRSM('L','U','N','N',NR,NR,CONE,CWORK(2*N+1),N, + $ U,LDU) +* .. apply the permutation from the second QR factorization + DO 873 q = 1, NR + DO 872 p = 1, NR + CWORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q) + 872 CONTINUE + DO 874 p = 1, NR + U(p,q) = CWORK(2*N+N*NR+NR+p) + 874 CONTINUE + 873 CONTINUE + IF ( NR .LT. N ) THEN + CALL ZLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV ) + CALL ZLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV ) + CALL ZLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV) + END IF + CALL ZUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1), + $ V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR ) + ELSE +* Last line of defense. +* #:( This is a rather pathological case: no scaled condition +* improvement after two pivoted QR factorizations. Other +* possibility is that the rank revealing QR factorization +* or the condition estimator has failed, or the COND_OK +* is set very close to ONE (which is unnecessary). Normally, +* this branch should never be executed, but in rare cases of +* failure of the RRQR or condition estimator, the last line of +* defense ensures that ZGEJSV completes the task. +* Compute the full SVD of L3 using ZGESVJ with explicit +* accumulation of Jacobi rotations. + CALL ZGESVJ( 'L', 'U', 'V', NR, NR, V, LDV, SVA, NR, U, + $ LDU, CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, + $ RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + IF ( NR .LT. N ) THEN + CALL ZLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV ) + CALL ZLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV ) + CALL ZLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV) + END IF + CALL ZUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1), + $ V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR ) +* + CALL ZUNMLQ( 'L', 'C', NR, NR, NR, CWORK(2*N+1), N, + $ CWORK(2*N+N*NR+1), U, LDU, CWORK(2*N+N*NR+NR+1), + $ LWORK-2*N-N*NR-NR, IERR ) + DO 773 q = 1, NR + DO 772 p = 1, NR + CWORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q) + 772 CONTINUE + DO 774 p = 1, NR + U(p,q) = CWORK(2*N+N*NR+NR+p) + 774 CONTINUE + 773 CONTINUE +* + END IF +* +* Permute the rows of V using the (column) permutation from the +* first QRF. Also, scale the columns to make them unit in +* Euclidean norm. This applies to all cases. +* + TEMP1 = SQRT(DBLE(N)) * EPSLN + DO 1972 q = 1, N + DO 972 p = 1, N + CWORK(2*N+N*NR+NR+IWORK(p)) = V(p,q) + 972 CONTINUE + DO 973 p = 1, N + V(p,q) = CWORK(2*N+N*NR+NR+p) + 973 CONTINUE + XSC = ONE / DZNRM2( N, V(1,q), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL ZDSCAL( N, XSC, V(1,q), 1 ) + 1972 CONTINUE +* At this moment, V contains the right singular vectors of A. +* Next, assemble the left singular vector matrix U (M x N). + IF ( NR .LT. M ) THEN + CALL ZLASET('A', M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU) + IF ( NR .LT. N1 ) THEN + CALL ZLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU) + CALL ZLASET('A',M-NR,N1-NR,CZERO,CONE, + $ U(NR+1,NR+1),LDU) + END IF + END IF +* +* The Q matrix from the first QRF is built into the left singular +* matrix U. This applies to all cases. +* + CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) + +* The columns of U are normalized. The cost is O(M*N) flops. + TEMP1 = SQRT(DBLE(M)) * EPSLN + DO 1973 p = 1, NR + XSC = ONE / DZNRM2( M, U(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL ZDSCAL( M, XSC, U(1,p), 1 ) + 1973 CONTINUE +* +* If the initial QRF is computed with row pivoting, the left +* singular vectors must be adjusted. +* + IF ( ROWPIV ) + $ CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* + ELSE +* +* .. the initial matrix A has almost orthogonal columns and +* the second QRF is not needed +* + CALL ZLACPY( 'U', N, N, A, LDA, CWORK(N+1), N ) + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 5970 p = 2, N + CTEMP = XSC * CWORK( N + (p-1)*N + p ) + DO 5971 q = 1, p - 1 +* CWORK(N+(q-1)*N+p)=-TEMP1 * ( CWORK(N+(p-1)*N+q) / +* $ ABS(CWORK(N+(p-1)*N+q)) ) + CWORK(N+(q-1)*N+p)=-CTEMP + 5971 CONTINUE + 5970 CONTINUE + ELSE + CALL ZLASET( 'L',N-1,N-1,CZERO,CZERO,CWORK(N+2),N ) + END IF +* + CALL ZGESVJ( 'U', 'U', 'N', N, N, CWORK(N+1), N, SVA, + $ N, U, LDU, CWORK(N+N*N+1), LWORK-N-N*N, RWORK, LRWORK, + $ INFO ) +* + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + DO 6970 p = 1, N + CALL ZCOPY( N, CWORK(N+(p-1)*N+1), 1, U(1,p), 1 ) + CALL ZDSCAL( N, SVA(p), CWORK(N+(p-1)*N+1), 1 ) + 6970 CONTINUE +* + CALL ZTRSM( 'L', 'U', 'N', 'N', N, N, + $ CONE, A, LDA, CWORK(N+1), N ) + DO 6972 p = 1, N + CALL ZCOPY( N, CWORK(N+p), N, V(IWORK(p),1), LDV ) + 6972 CONTINUE + TEMP1 = SQRT(DBLE(N))*EPSLN + DO 6971 p = 1, N + XSC = ONE / DZNRM2( N, V(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL ZDSCAL( N, XSC, V(1,p), 1 ) + 6971 CONTINUE +* +* Assemble the left singular vector matrix U (M x N). +* + IF ( N .LT. M ) THEN + CALL ZLASET( 'A', M-N, N, CZERO, CZERO, U(N+1,1), LDU ) + IF ( N .LT. N1 ) THEN + CALL ZLASET('A',N, N1-N, CZERO, CZERO, U(1,N+1),LDU) + CALL ZLASET( 'A',M-N,N1-N, CZERO, CONE,U(N+1,N+1),LDU) + END IF + END IF + CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) + TEMP1 = SQRT(DBLE(M))*EPSLN + DO 6973 p = 1, N1 + XSC = ONE / DZNRM2( M, U(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL ZDSCAL( M, XSC, U(1,p), 1 ) + 6973 CONTINUE +* + IF ( ROWPIV ) + $ CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* + END IF +* +* end of the >> almost orthogonal case << in the full SVD +* + ELSE +* +* This branch deploys a preconditioned Jacobi SVD with explicitly +* accumulated rotations. It is included as optional, mainly for +* experimental purposes. It does perform well, and can also be used. +* In this implementation, this branch will be automatically activated +* if the condition number sigma_max(A) / sigma_min(A) is predicted +* to be greater than the overflow threshold. This is because the +* a posteriori computation of the singular vectors assumes robust +* implementation of BLAS and some LAPACK procedures, capable of working +* in presence of extreme values, e.g. when the singular values spread from +* the underflow to the overflow threshold. +* + DO 7968 p = 1, NR + CALL ZCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + CALL ZLACGV( N-p+1, V(p,p), 1 ) + 7968 CONTINUE +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL/EPSLN) + DO 5969 q = 1, NR + CTEMP = DCMPLX(XSC*ABS( V(q,q) ),ZERO) + DO 5968 p = 1, N + IF ( ( p .GT. q ) .AND. ( ABS(V(p,q)) .LE. TEMP1 ) + $ .OR. ( p .LT. q ) ) +* $ V(p,q) = TEMP1 * ( V(p,q) / ABS(V(p,q)) ) + $ V(p,q) = CTEMP + IF ( p .LT. q ) V(p,q) = - V(p,q) + 5968 CONTINUE + 5969 CONTINUE + ELSE + CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV ) + END IF + + CALL ZGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) + CALL ZLACPY( 'L', N, NR, V, LDV, CWORK(2*N+1), N ) +* + DO 7969 p = 1, NR + CALL ZCOPY( NR-p+1, V(p,p), LDV, U(p,p), 1 ) + CALL ZLACGV( NR-p+1, U(p,p), 1 ) + 7969 CONTINUE + + IF ( L2PERT ) THEN + XSC = SQRT(SMALL/EPSLN) + DO 9970 q = 2, NR + DO 9971 p = 1, q - 1 + CTEMP = DCMPLX(XSC * MIN(ABS(U(p,p)),ABS(U(q,q))), + $ ZERO) +* U(p,q) = - TEMP1 * ( U(q,p) / ABS(U(q,p)) ) + U(p,q) = - CTEMP + 9971 CONTINUE + 9970 CONTINUE + ELSE + CALL ZLASET('U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU ) + END IF + + CALL ZGESVJ( 'L', 'U', 'V', NR, NR, U, LDU, SVA, + $ N, V, LDV, CWORK(2*N+N*NR+1), LWORK-2*N-N*NR, + $ RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + + IF ( NR .LT. N ) THEN + CALL ZLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV ) + CALL ZLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV ) + CALL ZLASET( 'A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV ) + END IF + + CALL ZUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1), + $ V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR ) +* +* Permute the rows of V using the (column) permutation from the +* first QRF. Also, scale the columns to make them unit in +* Euclidean norm. This applies to all cases. +* + TEMP1 = SQRT(DBLE(N)) * EPSLN + DO 7972 q = 1, N + DO 8972 p = 1, N + CWORK(2*N+N*NR+NR+IWORK(p)) = V(p,q) + 8972 CONTINUE + DO 8973 p = 1, N + V(p,q) = CWORK(2*N+N*NR+NR+p) + 8973 CONTINUE + XSC = ONE / DZNRM2( N, V(1,q), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL ZDSCAL( N, XSC, V(1,q), 1 ) + 7972 CONTINUE +* +* At this moment, V contains the right singular vectors of A. +* Next, assemble the left singular vector matrix U (M x N). +* + IF ( NR .LT. M ) THEN + CALL ZLASET( 'A', M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU ) + IF ( NR .LT. N1 ) THEN + CALL ZLASET('A',NR, N1-NR, CZERO, CZERO, U(1,NR+1),LDU) + CALL ZLASET('A',M-NR,N1-NR, CZERO, CONE,U(NR+1,NR+1),LDU) + END IF + END IF +* + CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) +* + IF ( ROWPIV ) + $ CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* +* + END IF + IF ( TRANSP ) THEN +* .. swap U and V because the procedure worked on A^* + DO 6974 p = 1, N + CALL ZSWAP( N, U(1,p), 1, V(1,p), 1 ) + 6974 CONTINUE + END IF +* + END IF +* end of the full SVD +* +* Undo scaling, if necessary (and possible) +* + IF ( USCAL2 .LE. (BIG/SVA(1))*USCAL1 ) THEN + CALL DLASCL( 'G', 0, 0, USCAL1, USCAL2, NR, 1, SVA, N, IERR ) + USCAL1 = ONE + USCAL2 = ONE + END IF +* + IF ( NR .LT. N ) THEN + DO 3004 p = NR+1, N + SVA(p) = ZERO + 3004 CONTINUE + END IF +* + RWORK(1) = USCAL2 * SCALEM + RWORK(2) = USCAL1 + IF ( ERREST ) RWORK(3) = SCONDA + IF ( LSVEC .AND. RSVEC ) THEN + RWORK(4) = CONDR1 + RWORK(5) = CONDR2 + END IF + IF ( L2TRAN ) THEN + RWORK(6) = ENTRA + RWORK(7) = ENTRAT + END IF +* + IWORK(1) = NR + IWORK(2) = NUMRANK + IWORK(3) = WARNING + IF ( TRANSP ) THEN + IWORK(4) = 1 + ELSE + IWORK(4) = -1 + END IF + +* + RETURN +* .. +* .. END OF ZGEJSV +* .. + END +* diff --git a/lapack-netlib/zgesvx.f b/lapack-netlib/zgesvx.f new file mode 100644 index 000000000..3b193a1b2 --- /dev/null +++ b/lapack-netlib/zgesvx.f @@ -0,0 +1,602 @@ +*> \brief ZGESVX computes the solution to system of linear equations A * X = B for GE matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZGESVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, +* EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR, +* WORK, RWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, LDA, LDAF, LDB, LDX, N, NRHS +* DOUBLE PRECISION RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ) +* DOUBLE PRECISION BERR( * ), C( * ), FERR( * ), R( * ), +* $ RWORK( * ) +* COMPLEX*16 A( LDA, * ), AF( LDAF, * ), B( LDB, * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGESVX uses the LU factorization to compute the solution to a complex +*> system of linear equations +*> A * X = B, +*> where A is an N-by-N matrix and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = P * L * U, +*> where P is a permutation matrix, L is a unit lower triangular +*> matrix, and U is upper triangular. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AF and IPIV contain the factored form of A. +*> If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> A, AF, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AF and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AF and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations: +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> On entry, the N-by-N matrix A. If FACT = 'F' and EQUED is +*> not 'N', then A must have been equilibrated by the scaling +*> factors in R and/or C. A is not modified if FACT = 'F' or +*> 'N', or if FACT = 'E' and EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] AF +*> \verbatim +*> AF is COMPLEX*16 array, dimension (LDAF,N) +*> If FACT = 'F', then AF is an input argument and on entry +*> contains the factors L and U from the factorization +*> A = P*L*U as computed by ZGETRF. If EQUED .ne. 'N', then +*> AF is the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the equilibrated matrix A (see the description of A for +*> the form of the equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAF +*> \verbatim +*> LDAF is INTEGER +*> The leading dimension of the array AF. LDAF >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = P*L*U +*> as computed by ZGETRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is DOUBLE PRECISION array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (LDB,NRHS) +*> On entry, the N-by-NRHS right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is COMPLEX*16 array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). +*> \endverbatim +*> +*> \param[out] RCOND +*> \verbatim +*> RCOND is DOUBLE PRECISION +*> The estimate of the reciprocal condition number of the matrix +*> A after equilibration (if done). If RCOND is less than the +*> machine precision (in particular, if RCOND = 0), the matrix +*> is singular to working precision. This condition is +*> indicated by a return code of INFO > 0. +*> \endverbatim +*> +*> \param[out] FERR +*> \verbatim +*> FERR is DOUBLE PRECISION array, dimension (NRHS) +*> The estimated forward error bound for each solution vector +*> X(j) (the j-th column of the solution matrix X). +*> If XTRUE is the true solution corresponding to X(j), FERR(j) +*> is an estimated upper bound for the magnitude of the largest +*> element in (X(j) - XTRUE) divided by the magnitude of the +*> largest element in X(j). The estimate is as reliable as +*> the estimate for RCOND, and is almost always a slight +*> overestimate of the true error. +*> \endverbatim +*> +*> \param[out] BERR +*> \verbatim +*> BERR is DOUBLE PRECISION array, dimension (NRHS) +*> The componentwise relative backward error of each solution +*> vector X(j) (i.e., the smallest relative change in +*> any element of A or B that makes X(j) an exact solution). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (2*N) +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, dimension (MAX(1,2*N)) +*> On exit, RWORK(1) contains the reciprocal pivot growth +*> factor norm(A)/norm(U). The "max absolute element" norm is +*> used. If RWORK(1) is much less than 1, then the stability +*> of the LU factorization of the (equilibrated) matrix A +*> could be poor. This also means that the solution X, condition +*> estimator RCOND, and forward error bound FERR could be +*> unreliable. If factorization fails with 0 RWORK(1) contains the reciprocal pivot growth factor for the +*> leading INFO columns of A. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, and i is +*> <= N: U(i,i) is exactly zero. The factorization has +*> been completed, but the factor U is exactly +*> singular, so the solution and error bounds +*> could not be computed. RCOND = 0 is returned. +*> = N+1: U is nonsingular, but RCOND is less than machine +*> precision, meaning that the matrix is singular +*> to working precision. Nevertheless, the +*> solution and error bounds are computed because +*> there are a number of situations where the +*> computed solution can be more accurate than the +*> value of RCOND would suggest. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16GEsolve +* +* ===================================================================== + SUBROUTINE ZGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, + $ EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR, + $ WORK, RWORK, INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER EQUED, FACT, TRANS + INTEGER INFO, LDA, LDAF, LDB, LDX, N, NRHS + DOUBLE PRECISION RCOND +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + DOUBLE PRECISION BERR( * ), C( * ), FERR( * ), R( * ), + $ RWORK( * ) + COMPLEX*16 A( LDA, * ), AF( LDAF, * ), B( LDB, * ), + $ WORK( * ), X( LDX, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU + CHARACTER NORM + INTEGER I, INFEQU, J + DOUBLE PRECISION AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, + $ ROWCND, RPVGRW, SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + DOUBLE PRECISION DLAMCH, ZLANGE, ZLANTR + EXTERNAL LSAME, DLAMCH, ZLANGE, ZLANTR +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZGECON, ZGEEQU, ZGERFS, ZGETRF, ZGETRS, + $ ZLACPY, ZLAQGE +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + NOFACT = LSAME( FACT, 'N' ) + EQUIL = LSAME( FACT, 'E' ) + NOTRAN = LSAME( TRANS, 'N' ) + IF( NOFACT .OR. EQUIL ) THEN + EQUED = 'N' + ROWEQU = .FALSE. + COLEQU = .FALSE. + ELSE + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + SMLNUM = DLAMCH( 'Safe minimum' ) + BIGNUM = ONE / SMLNUM + END IF +* +* Test the input parameters. +* + IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) ) + $ THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE IF( LDAF.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -10 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -11 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -12 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -14 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -16 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGESVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL ZGEEQU( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL ZLAQGE( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, + $ EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. EQUIL ) THEN +* +* Compute the LU factorization of A. +* + CALL ZLACPY( 'Full', N, N, A, LDA, AF, LDAF ) + CALL ZGETRF( N, N, AF, LDAF, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + RPVGRW = ZLANTR( 'M', 'U', 'N', INFO, INFO, AF, LDAF, + $ RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = ZLANGE( 'M', N, INFO, A, LDA, RWORK ) / + $ RPVGRW + END IF + RWORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = ZLANGE( NORM, N, N, A, LDA, RWORK ) + RPVGRW = ZLANTR( 'M', 'U', 'N', N, N, AF, LDAF, RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = ZLANGE( 'M', N, N, A, LDA, RWORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL ZGECON( NORM, N, AF, LDAF, ANORM, RCOND, WORK, RWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL ZLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL ZGETRS( TRANS, N, NRHS, AF, LDAF, IPIV, X, LDX, INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL ZGERFS( TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, B, LDB, X, + $ LDX, FERR, BERR, WORK, RWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. +* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 80 J = 1, NRHS + DO 70 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 70 CONTINUE + 80 CONTINUE + DO 90 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 90 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 120 CONTINUE + END IF +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.DLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 +* + RWORK( 1 ) = RPVGRW + RETURN +* +* End of ZGESVX +* + END diff --git a/lapack/getf2/getf2_k.c b/lapack/getf2/getf2_k.c index 80c66dd7a..5795797d3 100644 --- a/lapack/getf2/getf2_k.c +++ b/lapack/getf2/getf2_k.c @@ -95,7 +95,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, GEMV_N(m - j, j, 0, dm1, a + j, lda, b, 1, b + j, 1, sb); jp = j + IAMAX_K(m - j, b + j, 1); - if (jp>m) jp = m; //avoid out of boundary + if (jp>m) jp = m; //avoid out of boundary when the iamax kernel does not cope with NaN in input, see gh issue 723 ipiv[j + offset] = jp + offset; jp--; temp1 = *(b + jp); diff --git a/lapack/getf2/zgetf2_k.c b/lapack/getf2/zgetf2_k.c index e3d53c96f..6a2137b3e 100644 --- a/lapack/getf2/zgetf2_k.c +++ b/lapack/getf2/zgetf2_k.c @@ -99,7 +99,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, GEMV_N(m - j, j, 0, dm1, ZERO, a + j * 2, lda, b, 1, b + j * 2, 1, sb); jp = j + IAMAX_K(m - j, b + j * 2, 1); - if (jp>m) jp = m; //avoid out of boundary + if (jp>m) jp = m; //avoid out of boundary when the iamax kernel does not cope with NaN in input, see gh issue 723 ipiv[j + offset] = jp + offset; jp--; diff --git a/lapack/laswp/csky/Makefile b/lapack/laswp/csky/Makefile new file mode 100644 index 000000000..75411deb5 --- /dev/null +++ b/lapack/laswp/csky/Makefile @@ -0,0 +1,13 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile + diff --git a/lapack/laswp/loongarch64/Makefile b/lapack/laswp/loongarch64/Makefile index 71e5a87cb..1c85667ec 100644 --- a/lapack/laswp/loongarch64/Makefile +++ b/lapack/laswp/loongarch64/Makefile @@ -1,6 +1,11 @@ TOPDIR = ../../.. include ../../../Makefile.system +ifeq ($(DYNAMIC_ARCH), 1) +LASWP = ../generic/laswp_k_4.c +ZLASWP = ../generic/zlaswp_k_4.c +endif + ifndef LASWP LASWP = ../generic/laswp_k.c endif diff --git a/lapack/lauum/lauum_L_parallel.c b/lapack/lauum/lauum_L_parallel.c index 1b32e4519..0f4eaefaa 100644 --- a/lapack/lauum/lauum_L_parallel.c +++ b/lapack/lauum/lauum_L_parallel.c @@ -102,7 +102,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.c = a; syrk_thread(mode | BLAS_TRANSA_T | BLAS_TRANSB_N | BLAS_UPLO, - &newarg, NULL, NULL, (int (*)(void))HERK_LC, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))HERK_LC, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = i; @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + (i ) * COMPSIZE; gemm_thread_n(mode | BLAS_TRANSA_T, - &newarg, NULL, NULL, (int (*)(void))TRMM_LCLN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT*, FLOAT*, BLASLONG))TRMM_LCLN, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = bk; diff --git a/lapack/lauum/lauum_U_parallel.c b/lapack/lauum/lauum_U_parallel.c index f5ea54c88..77bfeebc7 100644 --- a/lapack/lauum/lauum_U_parallel.c +++ b/lapack/lauum/lauum_U_parallel.c @@ -102,7 +102,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.c = a; syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, - &newarg, NULL, NULL, (int (*)(void))HERK_UN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))HERK_UN, sa, sb, args -> nthreads); newarg.m = i; newarg.n = bk; @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + ( i * lda) * COMPSIZE; gemm_thread_m(mode | BLAS_TRANSA_T | BLAS_RSIDE, - &newarg, NULL, NULL, (int (*)(void))TRMM_RCUN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT*, FLOAT*, BLASLONG))TRMM_RCUN, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = bk; diff --git a/lapack/potrf/potrf_L_parallel.c b/lapack/potrf/potrf_L_parallel.c index 986816d1a..7d6bcd776 100644 --- a/lapack/potrf/potrf_L_parallel.c +++ b/lapack/potrf/potrf_L_parallel.c @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + (i + bk + i * lda) * COMPSIZE; gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, - &newarg, NULL, NULL, (int (*)(void))TRSM_RCLN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))TRSM_RCLN, sa, sb, args -> nthreads); newarg.n = n - i - bk; newarg.k = bk; diff --git a/lapack/potrf/potrf_U_parallel.c b/lapack/potrf/potrf_U_parallel.c index cc6ff9912..1f1427276 100644 --- a/lapack/potrf/potrf_U_parallel.c +++ b/lapack/potrf/potrf_U_parallel.c @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; gemm_thread_n(mode | BLAS_TRANSA_T, - &newarg, NULL, NULL, (int (*)(void))TRSM_LCUN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))TRSM_LCUN, sa, sb, args -> nthreads); newarg.n = n - i - bk; newarg.k = bk; diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c index 29364cc05..a7c28f4c2 100644 --- a/lapack/potrf/potrf_parallel.c +++ b/lapack/potrf/potrf_parallel.c @@ -80,10 +80,6 @@ static FLOAT dm1 = -1.; #define DIVIDE_RATE 2 #endif -#ifndef SWITCH_RATIO -#define SWITCH_RATIO 2 -#endif - #ifndef LOWER #define TRANS #endif diff --git a/openblas.pc.in b/openblas.pc.in index 8ad6e8bee..6c27c462b 100644 --- a/openblas.pc.in +++ b/openblas.pc.in @@ -2,6 +2,6 @@ Name: openblas Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: ${version} URL: https://github.com/xianyi/OpenBLAS -Libs: -L${libdir} -lopenblas${libsuffix} +Libs: -L${libdir} -l$(libprefix}openblas${libnamesuffix} Libs.private: ${extralib} Cflags: -I${includedir} diff --git a/param.h b/param.h index f1f5cbdad..8bdc03380 100644 --- a/param.h +++ b/param.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011-2014, The OpenBLAS Project +Copyright (c) 2011-2023, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -2600,13 +2600,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 -#if defined(HAVE_GAS) && (HAVE_GAS == 1) -#define DGEMM_DEFAULT_UNROLL_M 16 -#define DGEMM_DEFAULT_UNROLL_N 4 -#else #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 8 -#endif #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 8 @@ -2748,19 +2743,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL -#ifdef HAVE_MSA -#define SGEMM_DEFAULT_UNROLL_M 8 -#define SGEMM_DEFAULT_UNROLL_N 8 - -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 4 - -#define CGEMM_DEFAULT_UNROLL_M 8 -#define CGEMM_DEFAULT_UNROLL_N 4 - -#define ZGEMM_DEFAULT_UNROLL_M 4 -#define ZGEMM_DEFAULT_UNROLL_N 4 -#else +#if defined(NO_MSA) #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2772,6 +2755,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 #endif #define SGEMM_DEFAULT_P 64 @@ -2845,31 +2840,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x0ffffUL +#if defined(NO_LASX) +#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 +#define SGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#else #define DGEMM_DEFAULT_UNROLL_N 4 -#define QGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 8 +#define SGEMM_DEFAULT_UNROLL_M 16 #define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 16 #define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 8 +#endif + +#define QGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 -#define SGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_M 16 #define QGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_M 1 -#define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 -#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_P 256 #define DGEMM_DEFAULT_P 32 #define CGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_R 12288 +#define SGEMM_DEFAULT_R 1024 #define DGEMM_DEFAULT_R 858 #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#define SGEMM_DEFAULT_Q 128 +#define SGEMM_DEFAULT_Q 256 #define DGEMM_DEFAULT_Q 152 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 128 @@ -2885,13 +2892,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 1 +#define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_P 128 @@ -2955,19 +2962,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN (BLASLONG) 0x03fffUL -#if defined(HAVE_MSA) -#define SGEMM_DEFAULT_UNROLL_M 8 -#define SGEMM_DEFAULT_UNROLL_N 8 - -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 4 - -#define CGEMM_DEFAULT_UNROLL_M 8 -#define CGEMM_DEFAULT_UNROLL_N 4 - -#define ZGEMM_DEFAULT_UNROLL_M 4 -#define ZGEMM_DEFAULT_UNROLL_N 4 -#else +#if defined(NO_MSA) #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -2979,6 +2974,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 #endif #define SGEMM_DEFAULT_P 128 @@ -3038,6 +3045,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(x280) +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 // 4 // 16 // 2 +#define SGEMM_DEFAULT_UNROLL_N 8// 4 // 4 // 2 + +/* SGEMM_UNROLL_MN is calculated as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N) + * Since we don't define SGEMM_UNROLL_M correctly we have to manually set this macro. + * If VLMAX size is ever more than 1024, this should be increased also. */ +#define SGEMM_DEFAULT_UNROLL_MN 32 + +#define DGEMM_DEFAULT_UNROLL_M 16 //2 // 8 +#define DGEMM_DEFAULT_UNROLL_N 8 //2 // 4 +#define DGEMM_DEFAULT_UNROLL_MN 32 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_MN 32 + +#define ZGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_MN 16 + +#define SGEMM_DEFAULT_P 160 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif #ifdef C910V #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 @@ -3077,6 +3130,84 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#ifdef RISCV64_ZVL128B +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif + +#ifdef RISCV64_ZVL256B +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 8 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 8 + +#define ZGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 64 +#define CGEMM_DEFAULT_P 64 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 64 + +#define SGEMM_DEFAULT_R 16384 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 8192 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif + #ifdef ARMV7 #define SNUMOPT 2 #define DNUMOPT 2 @@ -3338,6 +3469,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #elif defined(NEOVERSEN1) +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#else +#define SWITCH_RATIO 16 +#endif + #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3350,13 +3487,13 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P 128 -#define DGEMM_DEFAULT_P 160 +#define SGEMM_DEFAULT_P 240 +#define DGEMM_DEFAULT_P 240 #define CGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_Q 352 -#define DGEMM_DEFAULT_Q 128 +#define SGEMM_DEFAULT_Q 640 +#define DGEMM_DEFAULT_Q 320 #define CGEMM_DEFAULT_Q 224 #define ZGEMM_DEFAULT_Q 112 @@ -3365,29 +3502,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(NEOVERSEV1) +#elif defined(NEOVERSEV1) // 256-bit SVE -#define SWITCH_RATIO 16 +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#else +#define SWITCH_RATIO 16 +#endif #define SGEMM_DEFAULT_UNROLL_M 16 -#define SGEMM_DEFAULT_UNROLL_N 4 +#define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 // Actually 2VL (8) but kept separate to keep copies separate +#define DGEMM_DEFAULT_UNROLL_N 8 -#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_MN 16 -#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_MN 16 -#define SGEMM_DEFAULT_P 128 -#define DGEMM_DEFAULT_P 160 +#define SGEMM_DEFAULT_P 240 +#define DGEMM_DEFAULT_P 240 #define CGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_Q 352 -#define DGEMM_DEFAULT_Q 128 +#define SGEMM_DEFAULT_Q 640 +#define DGEMM_DEFAULT_Q 320 #define CGEMM_DEFAULT_Q 224 #define ZGEMM_DEFAULT_Q 112 @@ -3398,6 +3541,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #elif defined(NEOVERSEN2) +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#else +#define SWITCH_RATIO 16 +#endif + #undef SBGEMM_ALIGN_K #define SBGEMM_ALIGN_K 4 @@ -3433,7 +3582,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(ARMV8SVE) || defined(A64FX) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) +#elif defined(A64FX) // 512-bit SVE /* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */ @@ -3474,6 +3623,43 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(ARMV8SVE) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE + +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#else +#define SWITCH_RATIO 16 +#endif + +#define SGEMM_DEFAULT_UNROLL_M 4 // Actually 1VL (8) but kept seperate to keep copies seperate +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 8 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_MN 16 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_MN 16 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #else /* Other/undetected ARMv8 cores */ #define SGEMM_DEFAULT_UNROLL_M 16 @@ -3749,7 +3935,44 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define SYMV_P 16 #endif +#if defined(CSKY) || defined(CK860FV) +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif #ifdef GENERIC @@ -3838,6 +4061,10 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #endif +#ifndef SWITCH_RATIO +#define SWITCH_RATIO 2 +#endif + #ifndef QGEMM_DEFAULT_UNROLL_M #define QGEMM_DEFAULT_UNROLL_M 2 #endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d68b12d87..4ebd5348c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -22,9 +22,22 @@ if (BUILD_COMPLEX16) list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3) endif() +if (USE_GEMM3M) + if (BUILD_COMPLEX) + list (APPEND OpenBLAS_Tests cblat3_3m) + endif () + if (BUILD_COMPLEX16) + list (APPEND OpenBLAS_Tests zblat3_3m) + endif () +endif () + foreach(test_bin ${OpenBLAS_Tests}) add_executable(${test_bin} ${test_bin}.f) target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME}) +if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) + string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") +target_link_libraries(${test_bin} omp pthread) +endif() endforeach() # $1 exec, $2 input, $3 output_result @@ -78,4 +91,10 @@ add_test(NAME "${float_type}blas2" COMMAND ${helper_prefix} $ "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM) add_test(NAME "${float_type}blas3" COMMAND ${helper_prefix} $ "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM) +if (USE_GEMM3M) +if ((${float_type} STREQUAL "c") OR (${float_type} STREQUAL "z")) +add_test(NAME "${float_type}blas3_3m" + COMMAND ${helper_prefix} $ "${PROJECT_SOURCE_DIR}/test/${float_type}blat3_3m.dat" ${float_type_upper}BLAT3_3M.SUMM) +endif() +endif() endforeach() diff --git a/test/Makefile b/test/Makefile index 923f1537c..6a50b6c98 100644 --- a/test/Makefile +++ b/test/Makefile @@ -4,6 +4,24 @@ ifeq ($(F_COMPILER),GFORTRAN) override FFLAGS += -fno-tree-vectorize endif +SUPPORT_GEMM3M = 0 + +ifeq ($(ARCH), x86) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), x86_64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), ia64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), MIPS) +SUPPORT_GEMM3M = 1 +endif + ifeq ($(NOFORTRAN),1) all :: else @@ -153,11 +171,20 @@ ifeq ($(BUILD_DOUBLE),1) D3=dblat3 endif ifeq ($(BUILD_COMPLEX),1) +ifeq ($(SUPPORT_GEMM3M),1) +C3=cblat3 cblat3_3m +else C3=cblat3 endif +endif ifeq ($(BUILD_COMPLEX16),1) +ifeq ($(SUPPORT_GEMM3M),1) +Z3=zblat3 zblat3_3m +else Z3=zblat3 endif +endif + level3: $(B3) $(S3) $(D3) $(C3) $(Z3) @@ -260,17 +287,26 @@ endif FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) +CLDFLAGS = $(CFLAGS) $(LDFLAGS) ifeq ($(USE_OPENMP), 1) ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(C_COMPILER), CLANG) -CEXTRALIB = -lomp +CEXTRALIB += -lomp endif endif ifeq ($(F_COMPILER), NAG) CEXTRALIB = -lgomp endif +ifeq ($(F_COMPILER), IBM) +ifeq ($(C_COMPILER), GCC) +CEXTRALIB += -lgomp +endif +ifeq ($(C_COMPILER), CLANG) +CEXTRALIB += -lomp +endif +endif endif ifeq ($(BUILD_SINGLE),1) @@ -326,7 +362,7 @@ endif ifeq ($(BUILD_BFLOAT16),1) test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + $(CC) $(CLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) endif ifeq ($(BUILD_COMPLEX),1) diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index 276fecae9..bc74233ab 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -32,7 +32,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. typedef union { unsigned short v; +#if defined(_AIX) + struct __attribute__((packed)) +#else struct +#endif { #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ unsigned short s:1; @@ -49,7 +53,11 @@ typedef union typedef union { float v; +#if defined(_AIX) + struct __attribute__((packed)) +#else struct +#endif { #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ uint32_t s:1; @@ -92,6 +100,8 @@ main (int argc, char *argv[]) float C[m * n]; bfloat16_bits AA[m * k], BB[k * n]; float DD[m * n], CC[m * n]; + bfloat16 atmp,btmp; + blasint one=1; for (j = 0; j < m; j++) { @@ -100,16 +110,18 @@ main (int argc, char *argv[]) A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; C[j * k + i] = 0; - AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; - BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; + sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one); + sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one); + AA[j * k + i].v = atmp; + BB[j * k + i].v = btmp; CC[j * k + i] = 0; DD[j * k + i] = 0; } } SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m); - SBGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, - &m, BB, &k, &beta, CC, &m); + SBGEMM (&transA, &transB, &m, &n, &k, &alpha, (bfloat16*) AA, + &m, (bfloat16*)BB, &k, &beta, CC, &m); for (i = 0; i < n; i++) for (j = 0; j < m; j++) if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 2e32827d3..4771d8a27 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -15,9 +15,76 @@ else () test_dsdot.c test_dnrm2.c test_swap.c + test_zscal.c + test_amin.c + test_axpby.c ) endif () + +set(DIR_EXT test_extensions) +set(OpenBLAS_utest_ext_src +utest_main.c +${DIR_EXT}/common.c +${DIR_EXT}/xerbla.c +${DIR_EXT}/test_isamin.c +${DIR_EXT}/test_idamin.c +${DIR_EXT}/test_icamin.c +${DIR_EXT}/test_izamin.c +${DIR_EXT}/test_ssum.c +${DIR_EXT}/test_dsum.c +${DIR_EXT}/test_scsum.c +${DIR_EXT}/test_dzsum.c +${DIR_EXT}/test_samin.c +${DIR_EXT}/test_damin.c +${DIR_EXT}/test_scamin.c +${DIR_EXT}/test_dzamin.c +${DIR_EXT}/test_scamax.c +${DIR_EXT}/test_dzamax.c +${DIR_EXT}/test_zrotg.c +${DIR_EXT}/test_crotg.c +${DIR_EXT}/test_drotmg.c +${DIR_EXT}/test_srotmg.c +${DIR_EXT}/test_zscal.c +${DIR_EXT}/test_cscal.c +${DIR_EXT}/test_domatcopy.c +${DIR_EXT}/test_somatcopy.c +${DIR_EXT}/test_zomatcopy.c +${DIR_EXT}/test_comatcopy.c +${DIR_EXT}/test_simatcopy.c +${DIR_EXT}/test_dimatcopy.c +${DIR_EXT}/test_cimatcopy.c +${DIR_EXT}/test_zimatcopy.c +${DIR_EXT}/test_sgeadd.c +${DIR_EXT}/test_dgeadd.c +${DIR_EXT}/test_cgeadd.c +${DIR_EXT}/test_zgeadd.c +${DIR_EXT}/test_saxpby.c +${DIR_EXT}/test_daxpby.c +${DIR_EXT}/test_caxpby.c +${DIR_EXT}/test_zaxpby.c +${DIR_EXT}/test_caxpyc.c +${DIR_EXT}/test_zaxpyc.c +${DIR_EXT}/test_cgemv_t.c +${DIR_EXT}/test_zgemv_t.c +${DIR_EXT}/test_cgemv_n.c +${DIR_EXT}/test_zgemv_n.c +${DIR_EXT}/test_crot.c +${DIR_EXT}/test_zrot.c +${DIR_EXT}/test_cgbmv.c +${DIR_EXT}/test_zgbmv.c +${DIR_EXT}/test_dgemmt.c +${DIR_EXT}/test_sgemmt.c +${DIR_EXT}/test_cgemmt.c +${DIR_EXT}/test_zgemmt.c +${DIR_EXT}/test_ztrmv.c +${DIR_EXT}/test_ctrmv.c +${DIR_EXT}/test_ztrsv.c +${DIR_EXT}/test_ctrsv.c +${DIR_EXT}/test_zgemm.c +${DIR_EXT}/test_cgemm.c +) + # crashing on travis cl with an error code suggesting resource not found if (NOT MSVC) set(OpenBLAS_utest_src @@ -46,6 +113,13 @@ set(OpenBLAS_utest_src ${OpenBLAS_utest_src} test_potrs.c ) +set(OpenBLAS_utest_ext_src + ${OpenBLAS_utest_ext_src} + ${DIR_EXT}/test_cspmv.c + ${DIR_EXT}/test_zspmv.c + ${DIR_EXT}/test_csbmv.c + ${DIR_EXT}/test_zsbmv.c + ) if (NOT NO_CBLAS AND NOT NO_LAPACKE) set(OpenBLAS_utest_src ${OpenBLAS_utest_src} @@ -57,7 +131,11 @@ endif() set(OpenBLAS_utest_bin openblas_utest) add_executable(${OpenBLAS_utest_bin} ${OpenBLAS_utest_src}) +set(OpenBLAS_utest_ext_bin openblas_utest_ext) +add_executable(${OpenBLAS_utest_ext_bin} ${OpenBLAS_utest_ext_src}) + target_link_libraries(${OpenBLAS_utest_bin} ${OpenBLAS_LIBNAME}) +target_link_libraries(${OpenBLAS_utest_ext_bin} ${OpenBLAS_LIBNAME}) if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX" ) target_link_libraries(${OpenBLAS_utest_bin} m) @@ -82,3 +160,4 @@ add_custom_command(TARGET ${OpenBLAS_utest_bin} endif() add_test(${OpenBLAS_utest_bin} ${CMAKE_CURRENT_BINARY_DIR}/${OpenBLAS_utest_bin}) +add_test(${OpenBLAS_utest_ext_bin} ${CMAKE_CURRENT_BINARY_DIR}/${OpenBLAS_utest_bin}) diff --git a/utest/Makefile b/utest/Makefile index f99035440..0e3f2b8a7 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -1,21 +1,39 @@ UTEST_CHECK = 1 TOPDIR = .. +DIR_EXT=test_extensions override TARGET_ARCH= override TARGET_MACH= UTESTBIN=openblas_utest +UTESTEXTBIN=openblas_utest_ext .PHONY : all -.NOTPARALLEL : all run_test $(UTESTBIN) +.NOTPARALLEL : all run_test $(UTESTBIN) $(UTESTEXTBIN) include $(TOPDIR)/Makefile.system -OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o +OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o \ + test_amin.o test_axpby.o #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o +OBJS_EXT=utest_main.o $(DIR_EXT)/xerbla.o $(DIR_EXT)/common.o +OBJS_EXT+=$(DIR_EXT)/test_isamin.o $(DIR_EXT)/test_idamin.o $(DIR_EXT)/test_icamin.o $(DIR_EXT)/test_izamin.o +OBJS_EXT+=$(DIR_EXT)/test_ssum.o $(DIR_EXT)/test_dsum.o $(DIR_EXT)/test_scsum.o $(DIR_EXT)/test_dzsum.o +OBJS_EXT+=$(DIR_EXT)/test_saxpby.o $(DIR_EXT)/test_daxpby.o $(DIR_EXT)/test_caxpby.o $(DIR_EXT)/test_zaxpby.o $(DIR_EXT)/test_zaxpyc.o $(DIR_EXT)/test_caxpyc.o +OBJS_EXT+=$(DIR_EXT)/test_samin.o $(DIR_EXT)/test_damin.o $(DIR_EXT)/test_scamin.o $(DIR_EXT)/test_dzamin.o $(DIR_EXT)/test_scamax.o $(DIR_EXT)/test_dzamax.o +OBJS_EXT+=$(DIR_EXT)/test_drotmg.o $(DIR_EXT)/test_srotmg.o $(DIR_EXT)/test_zrotg.o $(DIR_EXT)/test_crotg.o $(DIR_EXT)/test_crot.o $(DIR_EXT)/test_zrot.o +OBJS_EXT+=$(DIR_EXT)/test_zscal.o $(DIR_EXT)/test_cscal.o +OBJS_EXT+=$(DIR_EXT)/test_domatcopy.o $(DIR_EXT)/test_somatcopy.o $(DIR_EXT)/test_zomatcopy.o $(DIR_EXT)/test_comatcopy.o +OBJS_EXT+=$(DIR_EXT)/test_simatcopy.o $(DIR_EXT)/test_dimatcopy.o $(DIR_EXT)/test_cimatcopy.o $(DIR_EXT)/test_zimatcopy.o +OBJS_EXT+=$(DIR_EXT)/test_sgeadd.o $(DIR_EXT)/test_dgeadd.o $(DIR_EXT)/test_cgeadd.o $(DIR_EXT)/test_zgeadd.o +OBJS_EXT+=$(DIR_EXT)/test_cgemv_t.o $(DIR_EXT)/test_zgemv_t.o $(DIR_EXT)/test_cgemv_n.o $(DIR_EXT)/test_zgemv_n.o +OBJS_EXT+=$(DIR_EXT)/test_sgemmt.o $(DIR_EXT)/test_dgemmt.o $(DIR_EXT)/test_cgemmt.o $(DIR_EXT)/test_zgemmt.o +OBJS_EXT+=$(DIR_EXT)/test_ztrmv.o $(DIR_EXT)/test_ctrmv.o $(DIR_EXT)/test_ztrsv.o $(DIR_EXT)/test_ctrsv.o +OBJS_EXT+=$(DIR_EXT)/test_zgemm.o $(DIR_EXT)/test_cgemm.o $(DIR_EXT)/test_zgbmv.o $(DIR_EXT)/test_cgbmv.o ifneq ($(NO_LAPACK), 1) OBJS += test_potrs.o +OBJS_EXT += $(DIR_EXT)/test_zspmv.o $(DIR_EXT)/test_cspmv.o $(DIR_EXT)/test_zsbmv.o $(DIR_EXT)/test_csbmv.o ifneq ($(NO_CBLAS), 1) ifneq ($(NO_LAPACKE), 1) OBJS += test_kernel_regress.o @@ -44,15 +62,30 @@ endif all : run_test +ifeq ($(OSNAME), AIX) +ifeq ($(USE_OPENMP), 1) +$(UTESTBIN): $(OBJS) + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) +else +$(UTESTBIN): $(OBJS) + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) +endif +else $(UTESTBIN): $(OBJS) $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) +endif + +$(UTESTEXTBIN): $(OBJS_EXT) + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -run_test: $(UTESTBIN) +run_test: $(UTESTBIN) $(UTESTEXTBIN) ifneq ($(CROSS), 1) ./$(UTESTBIN) + ./$(UTESTEXTBIN) endif clean: - -rm -f *.o $(UTESTBIN) + -rm -f *.o $(UTESTBIN) $(UTESTEXTBIN) + -rm -f $(DIR_EXT)/*.o -libs: +libs: \ No newline at end of file diff --git a/utest/ctest.h b/utest/ctest.h index 79961badf..b158b4538 100644 --- a/utest/ctest.h +++ b/utest/ctest.h @@ -41,7 +41,7 @@ typedef void (*RunWithDataFunc)(void*); struct ctest { const char* ssname; // suite name const char* ttname; // test name - void (*run)(); + void (*run)(void); int skip; void* data; @@ -159,9 +159,9 @@ struct ctest { void WEAK sname##_teardown(struct sname##_data* data) #define __CTEST_INTERNAL(sname, tname, _skip) \ - void __FNAME(sname, tname)(); \ + void __FNAME(sname, tname)(void); \ __CTEST_STRUCT(sname, tname, _skip, NULL, NULL, NULL) \ - void __FNAME(sname, tname)() + void __FNAME(sname, tname)(void) #ifdef __CTEST_APPLE #define SETUP_FNAME(sname) NULL @@ -366,7 +366,7 @@ void __ctest_addTest(struct ctest *test) #ifndef __CTEST_MSVC /* Add all tests to linked list automatically. */ -static void __ctest_linkTests() +static void __ctest_linkTests(void) { struct ctest ** test; struct ctest ** ctest_begin = (struct ctest **)__PNAME(suite, test); @@ -401,7 +401,7 @@ static void __ctest_linkTests() __ctest_head_p = ctest_begin; } #else //for msvc -static void __ctest_linkTests() +static void __ctest_linkTests(void) { struct ctest ** ctest_start = __ctest_head_p; struct ctest ** test; @@ -450,7 +450,7 @@ static void msg_start(const char* color, const char* title) { print_errormsg(" %s: ", title); } -static void msg_end() { +static void msg_end(void) { if (color_output) { print_errormsg(ANSI_NORMAL); } @@ -634,7 +634,7 @@ static int suite_test_filter(struct ctest* t) { #ifndef __CTEST_NO_TIME -static uint64_t getCurrentTime() { +static uint64_t getCurrentTime(void) { struct timeval now; gettimeofday(&now, NULL); uint64_t now64 = (uint64_t) now.tv_sec; diff --git a/utest/test_amax.c b/utest/test_amax.c index a9e5a1c85..e9775caf0 100644 --- a/utest/test_amax.c +++ b/utest/test_amax.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011-2016, The OpenBLAS Project +Copyright (c) 2011-2024, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,9 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" @@ -57,4 +57,31 @@ CTEST(amax, damax){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); } #endif +#ifdef BUILD_COMPLEX +CTEST(amax, scamax){ + blasint N = 9, inc = 1; + float te_max = 0.0, tr_max = 0.0; + float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, + -7.7, 8.8 }; + te_max = BLASFUNC(scamax)(&N, x, &inc); + tr_max = 20.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); +} +#endif +#ifdef BUILD_COMPLEX16 +CTEST(amax, dzamax){ + blasint N = 9, inc = 1; + double te_max = 0.0, tr_max = 0.0; + double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, + -7.7, 8.8 }; + + te_max = BLASFUNC(dzamax)(&N, x, &inc); + tr_max = 20.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); +} +#endif diff --git a/utest/test_amin.c b/utest/test_amin.c new file mode 100644 index 000000000..1305ab8ef --- /dev/null +++ b/utest/test_amin.c @@ -0,0 +1,89 @@ +/***************************************************************************** +Copyright (c) 2011-2024, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "openblas_utest.h" + +#ifdef BUILD_SINGLE +CTEST(amin, samin){ + blasint N = 3, inc = 1; + float te_min = 0.0, tr_min = 0.0; + float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9 }; + + te_min = BLASFUNC(samin)(&N, x, &inc); + tr_min = 1.1; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); +} +#endif +#ifdef BUILD_DOUBLE +CTEST(amin, damin){ + blasint N = 3, inc = 1; + double te_min = 0.0, tr_min = 0.0; + double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9 }; + + te_min = BLASFUNC(damin)(&N, x, &inc); + tr_min = 1.1; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS); +} +#endif +#ifdef BUILD_COMPLEX +CTEST(amin, scamin){ + blasint N = 9, inc = 1; + float te_min = 0.0, tr_min = 0.0; + float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, + -7.7, 8.8 }; + + te_min = BLASFUNC(scamin)(&N, x, &inc); + tr_min = 3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); +} +#endif +#ifdef BUILD_COMPLEX16 +CTEST(amin, dzamin){ + blasint N = 9, inc = 1; + double te_min = 0.0, tr_min = 0.0; + double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, + -7.7, 8.8 }; + + te_min = BLASFUNC(dzamin)(&N, x, &inc); + tr_min = 3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS); +} +#endif diff --git a/utest/test_axpby.c b/utest/test_axpby.c new file mode 100644 index 000000000..37ba8ad14 --- /dev/null +++ b/utest/test_axpby.c @@ -0,0 +1,320 @@ +/***************************************************************************** +Copyright (c) 2011-2024, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "openblas_utest.h" + +#ifdef BUILD_SINGLE +CTEST(axpby, saxpby_inc_0) +{ + blasint i; + blasint N = 9, incX = 0, incY = 0; + float alpha = 1.0, beta = 2.0; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(saxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { 1535.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} + +CTEST(axpby, saxpby_inc_1) +{ + blasint i; + blasint N = 9, incX = 1, incY = 1; + float alpha = 0.25, beta = 0.75; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(saxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { 1.75, 3.75, 5.75, 7.75, 1.75, 3.75, 5.75, 7.75, 9.75 }; + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} + +CTEST(axpby, saxpby_inc_2) +{ + blasint i; + blasint N = 9, incX = 2, incY = 2; + float alpha = 0.25, beta = 0.75; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(saxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { 1.75, 4.00, 5.75, 8.00, 1.75, 4.00, 5.75, 8.00, + 9.75, 2.00, 3.75, 6.00, 7.75, 2.00, 3.75, 6.00, + 7.75, 10.00 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} +#endif + +#ifdef BUILD_DOUBLE +CTEST(axpby, daxpby_inc_0) +{ + blasint i; + blasint N = 9, incX = 0, incY = 0; + double alpha = 1.0, beta = 2.0; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(daxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { 1535.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} + +CTEST(axpby, daxpby_inc_1) +{ + blasint i; + blasint N = 9, incX = 1, incY = 1; + double alpha = 0.25, beta = 0.75; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(daxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { 1.75, 3.75, 5.75, 7.75, 1.75, 3.75, 5.75, 7.75, 9.75 }; + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} + +CTEST(axpby, daxpby_inc_2) +{ + blasint i; + blasint N = 9, incX = 2, incY = 2; + double alpha = 0.25, beta = 0.75; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(daxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { 1.75, 4.00, 5.75, 8.00, 1.75, 4.00, 5.75, 8.00, + 9.75, 2.00, 3.75, 6.00, 7.75, 2.00, 3.75, 6.00, + 7.75, 10.00 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} +#endif + +#ifdef BUILD_COMPLEX +CTEST(axpby, caxpby_inc_0) +{ + blasint i; + blasint N = 9, incX = 0, incY = 0; + float alpha[] = { 1.0, 2.0 }, beta[] = { 2.0, 1.0 }; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(caxpby)(&N, alpha, x1, &incX, beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { 9355.0, -8865.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, + 10.0, 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} + +CTEST(axpby, caxpby_inc_1) +{ + blasint i; + blasint N = 9, incX = 1, incY = 1; + float alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 }; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(caxpby)(&N, alpha, x1, &incX, beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { -2.0, 5.5, -2.0, 13.5, -2.0, 5.5, -2.0, 13.5, + 8.0, 11.5, -2.0, 9.5, 6.0, 9.5, -2.0, 9.5, -2.0, 17.5 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} + +CTEST(axpby, caxpby_inc_2) +{ + blasint i; + blasint N = 9, incX = 2, incY = 2; + float alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 }; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(caxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { -2.0, 5.5, 6.0, 8.0, -2.0, 5.5, 6.0, 8.0, 8.0, + 11.5, 4.0, 6.0, 6.0, 9.5, 4.0, 6.0, -2.0, 17.5, + 2.0, 4.0, -2.0, 13.5, 2.0, 4.0, -2.0, 13.5, 10.0, + 2.0, -2.0, 9.5, 8.0, 2.0, -2.0, 9.5, 8.0, 10.0 }; + + for(i = 0; i < 4 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} +#endif + +#ifdef BUILD_COMPLEX16 +CTEST(axpby, zaxpby_inc_0) +{ + blasint i; + blasint N = 9, incX = 0, incY = 0; + double alpha[] = { 1.0, 2.0 }, beta[] = { 2.0, 1.0 }; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(zaxpby)(&N, alpha, x1, &incX, beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { 9355.0, -8865.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, + 10.0, 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} + +CTEST(axpby, zaxpby_inc_1) +{ + blasint i; + blasint N = 9, incX = 1, incY = 1; + double alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 }; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(zaxpby)(&N, alpha, x1, &incX, beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { -2.0, 5.5, -2.0, 13.5, -2.0, 5.5, -2.0, 13.5, + 8.0, 11.5, -2.0, 9.5, 6.0, 9.5, -2.0, 9.5, -2.0, 17.5 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} + +CTEST(axpby, zaxpby_inc_2) +{ + blasint i; + blasint N = 9, incX = 2, incY = 2; + double alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 }; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(zaxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { -2.0, 5.5, 6.0, 8.0, -2.0, 5.5, 6.0, 8.0, 8.0, + 11.5, 4.0, 6.0, 6.0, 9.5, 4.0, 6.0, -2.0, 17.5, + 2.0, 4.0, -2.0, 13.5, 2.0, 4.0, -2.0, 13.5, 10.0, + 2.0, -2.0, 9.5, 8.0, 2.0, -2.0, 9.5, 8.0, 10.0 }; + + for(i = 0; i < 4 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} +#endif diff --git a/utest/test_axpy.c b/utest/test_axpy.c index 5fd7c1b04..26005e70f 100644 --- a/utest/test_axpy.c +++ b/utest/test_axpy.c @@ -74,6 +74,26 @@ CTEST(axpy,zaxpy_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } + +CTEST(axpy,zaxpy_incx_0) +{ + blasint i; + blasint N=4,incX=0,incY=1; + double a[2]={0.25,0.5}; + double x1[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; + double y1[]={2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0}; + double x2[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; + double y2[]={0.75,5.25,4.75,9.25,0.75,5.25,4.75,9.25}; + + //OpenBLAS + BLASFUNC(zaxpy)(&N,a,x1,&incX,y1,&incY); + + for(i=0; i<2*N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} + #endif #ifdef BUILD_SINGLE @@ -116,5 +136,24 @@ CTEST(axpy,caxpy_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } + +CTEST(axpy,caxpy_incx_0) +{ + blasint i; + blasint N=4,incX=0,incY=1; + float a[2]={0.25,0.5}; + float x1[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; + float y1[]={2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0}; + double x2[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; + double y2[]={0.75,5.25,4.75,9.25,0.75,5.25,4.75,9.25}; + + //OpenBLAS + BLASFUNC(caxpy)(&N,a,x1,&incX,y1,&incY); + + for(i=0; i<2*N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} #endif diff --git a/utest/test_extensions/common.c b/utest/test_extensions/common.c new file mode 100644 index 000000000..8a6a47795 --- /dev/null +++ b/utest/test_extensions/common.c @@ -0,0 +1,259 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "common.h" + +/** + * Generate random array + */ +void srand_generate(float *alpha, blasint n) +{ + blasint i; + for (i = 0; i < n; i++) + alpha[i] = (float)rand() / (float)RAND_MAX; +} + +void drand_generate(double *alpha, blasint n) +{ + blasint i; + for (i = 0; i < n; i++) + alpha[i] = (double)rand() / (double)RAND_MAX; +} + +/** + * Find difference between two rectangle matrix + * return norm of differences + */ +float smatrix_difference(float *a, float *b, blasint cols, blasint rows, blasint ld) +{ + blasint i = 0; + blasint j = 0; + blasint inc = 1; + float norm = 0.0f; + + float *a_ptr = a; + float *b_ptr = b; + + for(i = 0; i < rows; i++) + { + for (j = 0; j < cols; j++) { + a_ptr[j] -= b_ptr[j]; + } + norm += cblas_snrm2(cols, a_ptr, inc); + + a_ptr += ld; + b_ptr += ld; + } + return norm/(float)(rows); +} + +double dmatrix_difference(double *a, double *b, blasint cols, blasint rows, blasint ld) +{ + blasint i = 0; + blasint j = 0; + blasint inc = 1; + double norm = 0.0; + + double *a_ptr = a; + double *b_ptr = b; + + for(i = 0; i < rows; i++) + { + for (j = 0; j < cols; j++) { + a_ptr[j] -= b_ptr[j]; + } + norm += cblas_dnrm2(cols, a_ptr, inc); + + a_ptr += ld; + b_ptr += ld; + } + return norm/(double)(rows); +} + +/** + * Complex conjugate operation for vector + * + * param n specifies number of elements in vector x + * param inc_x specifies increment of vector x + * param x_ptr specifies buffer holding vector x + */ +void cconjugate_vector(blasint n, blasint inc_x, float *x_ptr) +{ + blasint i; + inc_x *= 2; + + for (i = 0; i < n; i++) + { + x_ptr[1] *= (-1.0f); + x_ptr += inc_x; + } +} + +void zconjugate_vector(blasint n, blasint inc_x, double *x_ptr) +{ + blasint i; + inc_x *= 2; + + for (i = 0; i < n; i++) + { + x_ptr[1] *= (-1.0); + x_ptr += inc_x; + } +} + +/** + * Transpose matrix + * + * param rows specifies number of rows of A + * param cols specifies number of columns of A + * param alpha specifies scaling factor for matrix A + * param a_src - buffer holding input matrix A + * param lda_src - leading dimension of the matrix A + * param a_dst - buffer holding output matrix A + * param lda_dst - leading dimension of output matrix A + */ +void stranspose(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, + float *a_dst, blasint lda_dst) +{ + blasint i, j; + for (i = 0; i != cols; i++) + { + for (j = 0; j != rows; j++) + a_dst[i*lda_dst+j] = alpha*a_src[j*lda_src+i]; + } +} + +void dtranspose(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, + double *a_dst, blasint lda_dst) +{ + blasint i, j; + for (i = 0; i != cols; i++) + { + for (j = 0; j != rows; j++) + a_dst[i*lda_dst+j] = alpha*a_src[j*lda_src+i]; + } +} + +void ctranspose(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, + float *a_dst, blasint lda_dst, int conj) +{ + blasint i, j; + lda_dst *= 2; + lda_src *= 2; + for (i = 0; i != cols*2; i+=2) + { + for (j = 0; j != rows*2; j+=2){ + a_dst[(i/2)*lda_dst+j] = alpha[0] * a_src[(j/2)*lda_src+i] + conj * alpha[1] * a_src[(j/2)*lda_src+i+1]; + a_dst[(i/2)*lda_dst+j+1] = (-1.0f) * conj * alpha[0] * a_src[(j/2)*lda_src+i+1] + alpha[1] * a_src[(j/2)*lda_src+i]; + } + } +} + +void ztranspose(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, + double *a_dst, blasint lda_dst, int conj) +{ + blasint i, j; + lda_dst *= 2; + lda_src *= 2; + for (i = 0; i != cols*2; i+=2) + { + for (j = 0; j != rows*2; j+=2){ + a_dst[(i/2)*lda_dst+j] = alpha[0] * a_src[(j/2)*lda_src+i] + conj * alpha[1] * a_src[(j/2)*lda_src+i+1]; + a_dst[(i/2)*lda_dst+j+1] = (-1.0) * conj * alpha[0] * a_src[(j/2)*lda_src+i+1] + alpha[1] * a_src[(j/2)*lda_src+i]; + } + } +} + +/** + * Copy matrix from source A to destination A + * + * param rows specifies number of rows of A + * param cols specifies number of columns of A + * param alpha specifies scaling factor for matrix A + * param a_src - buffer holding input matrix A + * param lda_src - leading dimension of the matrix A + * param a_dst - buffer holding output matrix A + * param lda_dst - leading dimension of output matrix A + * param conj specifies conjugation + */ +void scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, + float *a_dst, blasint lda_dst) +{ + blasint i, j; + for (i = 0; i != rows; i++) + { + for (j = 0; j != cols; j++) + a_dst[i*lda_dst+j] = alpha*a_src[i*lda_src+j]; + } +} + +void dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, + double *a_dst, blasint lda_dst) +{ + blasint i, j; + for (i = 0; i != rows; i++) + { + for (j = 0; j != cols; j++) + a_dst[i*lda_dst+j] = alpha*a_src[i*lda_src+j]; + } +} + +void ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, + float *a_dst, blasint lda_dst, int conj) +{ + blasint i, j; + lda_dst *= 2; + lda_src *= 2; + for (i = 0; i != rows; i++) + { + for (j = 0; j != cols*2; j+=2){ + a_dst[i*lda_dst+j] = alpha[0] * a_src[i*lda_src+j] + conj * alpha[1] * a_src[i*lda_src+j+1]; + a_dst[i*lda_dst+j+1] = (-1.0f) * conj *alpha[0] * a_src[i*lda_src+j+1] + alpha[1] * a_src[i*lda_src+j]; + } + } +} + +void zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, + double *a_dst, blasint lda_dst, int conj) +{ + blasint i, j; + lda_dst *= 2; + lda_src *= 2; + for (i = 0; i != rows; i++) + { + for (j = 0; j != cols*2; j+=2){ + a_dst[i*lda_dst+j] = alpha[0] * a_src[i*lda_src+j] + conj * alpha[1] * a_src[i*lda_src+j+1]; + a_dst[i*lda_dst+j+1] = (-1.0) * conj *alpha[0] * a_src[i*lda_src+j+1] + alpha[1] * a_src[i*lda_src+j]; + } + } +} \ No newline at end of file diff --git a/utest/test_extensions/common.h b/utest/test_extensions/common.h new file mode 100644 index 000000000..62b84325c --- /dev/null +++ b/utest/test_extensions/common.h @@ -0,0 +1,76 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#ifndef _TEST_EXTENSION_COMMON_H_ +#define _TEST_EXTENSION_COMMON_H_ + +#include +#include + +#define TRUE 1 +#define FALSE 0 +#define INVALID -1 +#define SINGLE_TOL 1e-02f +#define DOUBLE_TOL 1e-10 + +extern int check_error(void); +extern void set_xerbla(char* current_rout, int expected_info); +extern int BLASFUNC(xerbla)(char *name, blasint *info, blasint length); + +extern void srand_generate(float *alpha, blasint n); +extern void drand_generate(double *alpha, blasint n); + +extern float smatrix_difference(float *a, float *b, blasint cols, blasint rows, blasint ld); +extern double dmatrix_difference(double *a, double *b, blasint cols, blasint rows, blasint ld); + +extern void cconjugate_vector(blasint n, blasint inc_x, float *x_ptr); +extern void zconjugate_vector(blasint n, blasint inc_x, double *x_ptr); + +extern void stranspose(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, + float *a_dst, blasint lda_dst); +extern void dtranspose(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, + double *a_dst, blasint lda_dst); +extern void ctranspose(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, + float *a_dst, blasint lda_dst, int conj); +extern void ztranspose(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, + double *a_dst, blasint lda_dst, int conj); + +extern void scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, + float *a_dst, blasint lda_dst); +extern void dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, + double *a_dst, blasint lda_dst); +extern void ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, + float *a_dst, blasint lda_dst, int conj); +extern void zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, + double *a_dst, blasint lda_dst, int conj); +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_caxpby.c b/utest/test_extensions/test_caxpby.c new file mode 100644 index 000000000..221a48ac7 --- /dev/null +++ b/utest/test_extensions/test_caxpby.c @@ -0,0 +1,631 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_CAXPBY { + float x_test[DATASIZE * INCREMENT * 2]; + float x_verify[DATASIZE * INCREMENT * 2]; + float y_test[DATASIZE * INCREMENT * 2]; + float y_verify[DATASIZE * INCREMENT * 2]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CAXPBY data_caxpby; + +/** + * Fortran API specific function + * Test caxpby by comparing it with cscal and caxpy. + * Compare with the following options: + * + * param n - number of elements in vectors x and y + * param alpha - scalar alpha + * param incx - increment for the elements of x + * param beta - scalar beta + * param incy - increment for the elements of y + * return norm of difference + */ +static float check_caxpby(blasint n, float *alpha, blasint incx, float *beta, blasint incy) +{ + blasint i; + + // cscal accept only positive increments + blasint incx_abs = labs(incx); + blasint incy_abs = labs(incy); + + // Fill vectors x, y + srand_generate(data_caxpby.x_test, n * incx_abs * 2); + srand_generate(data_caxpby.y_test, n * incy_abs * 2); + + // Copy vector x for caxpy + for (i = 0; i < n * incx_abs * 2; i++) + data_caxpby.x_verify[i] = data_caxpby.x_test[i]; + + // Copy vector y for cscal + for (i = 0; i < n * incy_abs * 2; i++) + data_caxpby.y_verify[i] = data_caxpby.y_test[i]; + + // Find beta*y + BLASFUNC(cscal)(&n, beta, data_caxpby.y_verify, &incy_abs); + + // Find sum of alpha*x and beta*y + BLASFUNC(caxpy)(&n, alpha, data_caxpby.x_verify, &incx, + data_caxpby.y_verify, &incy); + + BLASFUNC(caxpby)(&n, alpha, data_caxpby.x_test, &incx, + beta, data_caxpby.y_test, &incy); + + // Find the differences between output vector caculated by caxpby and caxpy + for (i = 0; i < n * incy_abs * 2; i++) + data_caxpby.y_test[i] -= data_caxpby.y_verify[i]; + + // Find the norm of differences + return BLASFUNC(scnrm2)(&n, data_caxpby.y_test, &incy_abs); +} + +/** + * C API specific function + * Test caxpby by comparing it with cscal and caxpy. + * Compare with the following options: + * + * param n - number of elements in vectors x and y + * param alpha - scalar alpha + * param incx - increment for the elements of x + * param beta - scalar beta + * param incy - increment for the elements of y + * return norm of difference + */ +static float c_api_check_caxpby(blasint n, float *alpha, blasint incx, float *beta, blasint incy) +{ + blasint i; + + // cscal accept only positive increments + blasint incx_abs = labs(incx); + blasint incy_abs = labs(incy); + + // Fill vectors x, y + srand_generate(data_caxpby.x_test, n * incx_abs * 2); + srand_generate(data_caxpby.y_test, n * incy_abs * 2); + + // Copy vector x for caxpy + for (i = 0; i < n * incx_abs * 2; i++) + data_caxpby.x_verify[i] = data_caxpby.x_test[i]; + + // Copy vector y for cscal + for (i = 0; i < n * incy_abs * 2; i++) + data_caxpby.y_verify[i] = data_caxpby.y_test[i]; + + // Find beta*y + cblas_cscal(n, beta, data_caxpby.y_verify, incy_abs); + + // Find sum of alpha*x and beta*y + cblas_caxpy(n, alpha, data_caxpby.x_verify, incx, + data_caxpby.y_verify, incy); + + cblas_caxpby(n, alpha, data_caxpby.x_test, incx, + beta, data_caxpby.y_test, incy); + + // Find the differences between output vector caculated by caxpby and caxpy + for (i = 0; i < n * incy_abs * 2; i++) + data_caxpby.y_test[i] -= data_caxpby.y_verify[i]; + + // Find the norm of differences + return cblas_scnrm2(n, data_caxpby.y_test, incy_abs); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + */ +CTEST(caxpby, inc_x_1_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + */ +CTEST(caxpby, inc_x_2_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + */ +CTEST(caxpby, inc_x_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + */ +CTEST(caxpby, inc_x_2_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + float alpha[] = {3.0f, 1.0f}; + float beta[] = {4.0f, 3.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is 2 + */ +CTEST(caxpby, inc_x_neg_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = -1, incy = 2; + float alpha[] = {5.0f, 2.2f}; + float beta[] = {4.0f, 5.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is -1 + */ +CTEST(caxpby, inc_x_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = -1; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {6.0f, 3.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is -1 + */ +CTEST(caxpby, inc_x_neg_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = -2, incy = -1; + float alpha[] = {7.0f, 2.0f}; + float beta[] = {3.5f, 1.3f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + */ +CTEST(caxpby, inc_x_1_inc_y_1_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar beta is zero + */ +CTEST(caxpby, inc_x_1_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + * Scalar beta is zero + */ +CTEST(caxpby, inc_x_1_inc_y_1_N_100_a_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero + * Scalar beta is zero +*/ +CTEST(caxpby, inc_x_1_inc_y_2_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Check if n - size of vectors x, y is zero + */ +CTEST(caxpby, check_n_zero) +{ + blasint n = 0, incx = 1, incy = 1; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + */ +CTEST(caxpby, c_api_inc_x_1_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + */ +CTEST(caxpby, c_api_inc_x_2_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + */ +CTEST(caxpby, c_api_inc_x_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {2.0f, 2.1f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + */ +CTEST(caxpby, c_api_inc_x_2_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + float alpha[] = {3.0f, 2.0f}; + float beta[] = {4.0f, 3.0f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is 2 + */ +CTEST(caxpby, c_api_inc_x_neg_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = -1, incy = 2; + float alpha[] = {5.0f, 2.0f}; + float beta[] = {4.0f, 3.1f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is -1 + */ +CTEST(caxpby, c_api_inc_x_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = -1; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {6.0f, 2.3f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is -1 + */ +CTEST(caxpby, c_api_inc_x_neg_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = -2, incy = -1; + float alpha[] = {7.0f, 1.0f}; + float beta[] = {3.5f, 1.0f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + */ +CTEST(caxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar beta is zero + */ +CTEST(caxpby, c_api_inc_x_1_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + * Scalar beta is zero + */ +CTEST(caxpby, c_api_inc_x_1_inc_y_1_N_100_a_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero + * Scalar beta is zero +*/ +CTEST(caxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Check if n - size of vectors x, y is zero + */ +CTEST(caxpby, c_api_check_n_zero) +{ + blasint n = 0, incx = 1, incy = 1; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} +#endif diff --git a/utest/test_extensions/test_caxpyc.c b/utest/test_extensions/test_caxpyc.c new file mode 100644 index 000000000..ed1899e57 --- /dev/null +++ b/utest/test_extensions/test_caxpyc.c @@ -0,0 +1,158 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_CAXPYC { + float x_test[DATASIZE * INCREMENT * 2]; + float x_verify[DATASIZE * INCREMENT * 2]; + float y_test[DATASIZE * INCREMENT * 2]; + float y_verify[DATASIZE * INCREMENT * 2]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CAXPYC data_caxpyc; + +/** + * Test caxpyc by conjugating vector x and comparing with caxpy. + * Compare with the following options: + * + * param n - number of elements in vectors x and y + * param alpha - scalar alpha + * param incx - increment for the elements of x + * param incy - increment for the elements of y + * return norm of difference + */ +static float check_caxpyc(blasint n, float *alpha, blasint incx, blasint incy) +{ + blasint i; + + srand_generate(data_caxpyc.x_test, n * incx * 2); + srand_generate(data_caxpyc.y_test, n * incy * 2); + + for (i = 0; i < n * incx * 2; i++) + data_caxpyc.x_verify[i] = data_caxpyc.x_test[i]; + + for (i = 0; i < n * incy * 2; i++) + data_caxpyc.y_verify[i] = data_caxpyc.y_test[i]; + + cconjugate_vector(n, incx, data_caxpyc.x_verify); + + BLASFUNC(caxpy)(&n, alpha, data_caxpyc.x_verify, &incx, + data_caxpyc.y_verify, &incy); + + BLASFUNC(caxpyc)(&n, alpha, data_caxpyc.x_test, &incx, + data_caxpyc.y_test, &incy); + + for (i = 0; i < n * incy * 2; i++) + data_caxpyc.y_verify[i] -= data_caxpyc.y_test[i]; + + return BLASFUNC(scnrm2)(&n, data_caxpyc.y_verify, &incy); +} + +/** + * Test caxpyc by conjugating vector x and comparing with caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + */ +CTEST(caxpyc, conj_strides_one) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha[] = {5.0f, 2.2f}; + + float norm = check_caxpyc(n, alpha, incx, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test caxpyc by conjugating vector x and comparing with caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + */ +CTEST(caxpyc, conj_incx_one) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha[] = {5.0f, 2.2f}; + + float norm = check_caxpyc(n, alpha, incx, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test caxpyc by conjugating vector x and comparing with caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + */ +CTEST(caxpyc, conj_incy_one) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + float alpha[] = {5.0f, 2.2f}; + + float norm = check_caxpyc(n, alpha, incx, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test caxpyc by conjugating vector x and comparing with caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + */ +CTEST(caxpyc, conj_strides_two) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + float alpha[] = {5.0f, 2.2f}; + + float norm = check_caxpyc(n, alpha, incx, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} +#endif diff --git a/utest/test_extensions/test_cgbmv.c b/utest/test_extensions/test_cgbmv.c new file mode 100644 index 000000000..8e0640c5d --- /dev/null +++ b/utest/test_extensions/test_cgbmv.c @@ -0,0 +1,279 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 1 + +struct DATA_CGBMV { + float a_test[DATASIZE * DATASIZE * 2]; + float a_band_storage[DATASIZE * DATASIZE * 2]; + float matrix[DATASIZE * DATASIZE * 2]; + float b_test[DATASIZE * 2 * INCREMENT]; + float c_test[DATASIZE * 2 * INCREMENT]; + float c_verify[DATASIZE * 2 * INCREMENT]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CGBMV data_cgbmv; + +/** + * Transform full-storage band matrix A to band-packed storage mode. + * + * param m - number of rows of A + * param n - number of columns of A + * param kl - number of sub-diagonals of the matrix A + * param ku - number of super-diagonals of the matrix A + * output param a - buffer for holding band-packed matrix + * param lda - specifies the leading dimension of a + * param matrix - buffer holding full-storage band matrix A + * param ldm - specifies the leading full-storage band matrix A + */ +static void transform_to_band_storage(blasint m, blasint n, blasint kl, + blasint ku, float* a, blasint lda, + float* matrix, blasint ldm) +{ + blasint i, j, k; + for (j = 0; j < n; j++) + { + k = 2 * (ku - j); + for (i = MAX(0, 2*(j - ku)); i < MIN(m, j + kl + 1) * 2; i+=2) + { + a[(k + i) + j * lda * 2] = matrix[i + j * ldm * 2]; + a[(k + i) + j * lda * 2 + 1] = matrix[i + j * ldm * 2 + 1]; + } + } +} + +/** + * Generate full-storage band matrix A with kl sub-diagonals and ku super-diagonals + * + * param m - number of rows of A + * param n - number of columns of A + * param kl - number of sub-diagonals of the matrix A + * param ku - number of super-diagonals of the matrix A + * output param band_matrix - buffer for full-storage band matrix. + * param matrix - buffer holding input general matrix + * param ldm - specifies the leading of input general matrix +*/ +static void get_band_matrix(blasint m, blasint n, blasint kl, blasint ku, + float *band_matrix, float *matrix, blasint ldm) +{ + blasint i, j; + blasint k = 0; + for (i = 0; i < n; i++) + { + for (j = 0; j < m * 2; j += 2) + { + if ((blasint)(j/2) > kl + i || i > ku + (blasint)(j/2)) + { + band_matrix[i * ldm * 2 + j] = 0.0f; + band_matrix[i * ldm * 2 + j + 1] = 0.0f; + continue; + } + + band_matrix[i * ldm * 2 + j] = matrix[k++]; + band_matrix[i * ldm * 2 + j + 1] = matrix[k++]; + } + } +} + +/** + * Comapare results computed by cgbmv and cgemv + * since gbmv is gemv for band matrix + * + * param trans specifies op(A), the transposition operation applied to A + * param m - number of rows of A + * param n - number of columns of A + * param kl - number of sub-diagonals of the matrix A + * param ku - number of super-diagonals of the matrix A + * param alpha - scaling factor for the matrix-vector product + * param lda - specifies the leading dimension of a + * param inc_b - stride of vector b + * param beta - scaling factor for vector c + * param inc_c - stride of vector c + * return norm of differences + */ +static float check_cgbmv(char trans, blasint m, blasint n, blasint kl, blasint ku, + float *alpha, blasint lda, blasint inc_b, float *beta, blasint inc_c) +{ + blasint i; + blasint lenb, lenc; + + if(trans == 'T' || trans == 'C' || trans == 'D' || trans == 'U'){ + lenb = m; + lenc = n; + } else { + lenb = n; + lenc = m; + } + + srand_generate(data_cgbmv.matrix, m * n * 2); + srand_generate(data_cgbmv.b_test, 2 * (1 + (lenb - 1) * inc_b)); + srand_generate(data_cgbmv.c_test, 2 * (1 + (lenc - 1) * inc_c)); + + for (i = 0; i < 2 * (1 + (lenc - 1) * inc_c); i++) + data_cgbmv.c_verify[i] = data_cgbmv.c_test[i]; + + get_band_matrix(m, n, kl, ku, data_cgbmv.a_test, data_cgbmv.matrix, m); + + transform_to_band_storage(m, n, kl, ku, data_cgbmv.a_band_storage, lda, data_cgbmv.a_test, m); + + BLASFUNC(cgemv)(&trans, &m, &n, alpha, data_cgbmv.a_test, &m, data_cgbmv.b_test, + &inc_b, beta, data_cgbmv.c_verify, &inc_c); + + BLASFUNC(cgbmv)(&trans, &m, &n, &kl, &ku, alpha, data_cgbmv.a_band_storage, &lda, data_cgbmv.b_test, + &inc_b, beta, data_cgbmv.c_test, &inc_c); + + for (i = 0; i < 2 * (1 + (lenc - 1) * inc_c); i++) + data_cgbmv.c_verify[i] -= data_cgbmv.c_test[i]; + + return BLASFUNC(scnrm2)(&lenc, data_cgbmv.c_verify, &inc_c); +} + +/** + * Test cgbmv by comparing it against cgemv + * with param trans is D + */ +CTEST(cgbmv, trans_D) +{ + blasint m = 50, n = 25; + blasint inc_b = 1, inc_c = 1; + blasint kl = 20, ku = 11; + blasint lda = 50; + char trans = 'D'; + + float alpha[] = {7.0f, 1.0f}; + float beta[] = {1.5f, -1.5f}; + + float norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgbmv by comparing it against cgemv + * with param trans is O + */ +CTEST(cgbmv, trans_O) +{ + blasint m = 50, n = 25; + blasint inc_b = 1, inc_c = 1; + blasint kl = 20, ku = 10; + blasint lda = 50; + char trans = 'O'; + + float alpha[] = {7.0f, 1.0f}; + float beta[] = {1.5f, -1.5f}; + + float norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgbmv by comparing it against cgemv + * with param trans is S + */ +CTEST(cgbmv, trans_S) +{ + blasint m = 50, n = 25; + blasint inc_b = 1, inc_c = 1; + blasint kl = 6, ku = 9; + blasint lda = 50; + char trans = 'S'; + + float alpha[] = {7.0f, 1.0f}; + float beta[] = {1.5f, -1.5f}; + + float norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgbmv by comparing it against cgemv + * with param trans is U + */ +CTEST(cgbmv, trans_U) +{ + blasint m = 25, n = 50; + blasint inc_b = 1, inc_c = 1; + blasint kl = 7, ku = 11; + blasint lda = kl + ku + 1; + char trans = 'U'; + + float alpha[] = {7.0f, 1.0f}; + float beta[] = {1.5f, -1.5f}; + + float norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgbmv by comparing it against cgemv + * with param trans is C + */ +CTEST(cgbmv, trans_C) +{ + blasint m = 50, n = 25; + blasint inc_b = 1, inc_c = 1; + blasint kl = 20, ku = 11; + blasint lda = 50; + char trans = 'C'; + + float alpha[] = {7.0f, 1.0f}; + float beta[] = {1.5f, -1.5f}; + + float norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgbmv by comparing it against cgemv + * with param trans is R + */ +CTEST(cgbmv, trans_R) +{ + blasint m = 50, n = 100; + blasint inc_b = 1, inc_c = 1; + blasint kl = 20, ku = 11; + blasint lda = 50; + char trans = 'R'; + + float alpha[] = {7.0f, 1.0f}; + float beta[] = {1.5f, -1.5f}; + + float norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} +#endif diff --git a/utest/test_extensions/test_cgeadd.c b/utest/test_extensions/test_cgeadd.c new file mode 100644 index 000000000..9b87ad9f3 --- /dev/null +++ b/utest/test_extensions/test_cgeadd.c @@ -0,0 +1,880 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define N 100 +#define M 100 + +struct DATA_CGEADD { + float a_test[M * N * 2]; + float c_test[M * N * 2]; + float c_verify[M * N * 2]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CGEADD data_cgeadd; + +/** + * cgeadd reference implementation + * + * param m - number of rows of A and C + * param n - number of columns of A and C + * param alpha - scaling factor for matrix A + * param aptr - refer to matrix A + * param lda - leading dimension of A + * param beta - scaling factor for matrix C + * param cptr - refer to matrix C + * param ldc - leading dimension of C + */ +static void cgeadd_trusted(blasint m, blasint n, float *alpha, float *aptr, + blasint lda, float *beta, float *cptr, blasint ldc) +{ + blasint i; + + lda *= 2; + ldc *= 2; + + for (i = 0; i < n; i++) + { + cblas_caxpby(m, alpha, aptr, 1, beta, cptr, 1); + aptr += lda; + cptr += ldc; + } +} + +/** + * Test cgeadd by comparing it against reference + * Compare with the following options: + * + * param api - specifies Fortran or C API + * param order - specifies whether A and C stored in + * row-major order or column-major order + * param m - number of rows of A and C + * param n - number of columns of A and C + * param alpha - scaling factor for matrix A + * param lda - leading dimension of A + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + * return norm of differences + */ +static float check_cgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order, + blasint m, blasint n, float *alpha, blasint lda, + float *beta, blasint ldc) +{ + blasint i; + blasint cols = m, rows = n; + + if (order == CblasRowMajor) + { + rows = m; + cols = n; + } + + // Fill matrix A, C + srand_generate(data_cgeadd.a_test, lda * rows * 2); + srand_generate(data_cgeadd.c_test, ldc * rows * 2); + + // Copy matrix C for cgeadd + for (i = 0; i < ldc * rows * 2; i++) + data_cgeadd.c_verify[i] = data_cgeadd.c_test[i]; + + cgeadd_trusted(cols, rows, alpha, data_cgeadd.a_test, lda, + beta, data_cgeadd.c_verify, ldc); + + if (api == 'F') + BLASFUNC(cgeadd)(&m, &n, alpha, data_cgeadd.a_test, &lda, + beta, data_cgeadd.c_test, &ldc); + else + cblas_cgeadd(order, m, n, alpha, data_cgeadd.a_test, lda, + beta, data_cgeadd.c_test, ldc); + + // Find the differences between output matrix caculated by cgeadd and sgemm + return smatrix_difference(data_cgeadd.c_test, data_cgeadd.c_verify, cols, rows, ldc*2); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param api - specifies Fortran or C API + * param order - specifies whether A and C stored in + * row-major order or column-major order + * param m - number of rows of A and C + * param n - number of columns of A and C + * param lda - leading dimension of A + * param ldc - leading dimension of C + * param expected_info - expected invalid parameter number in cgeadd + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order, + blasint m, blasint n, blasint lda, + blasint ldc, int expected_info) +{ + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + set_xerbla("CGEADD ", expected_info); + + if (api == 'F') + BLASFUNC(cgeadd)(&m, &n, alpha, data_cgeadd.a_test, &lda, + beta, data_cgeadd.c_test, &ldc); + else + cblas_cgeadd(order, m, n, alpha, data_cgeadd.a_test, lda, + beta, data_cgeadd.c_test, ldc); + + return check_error(); +} + +/** + * Fortran API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(cgeadd, matrix_n_100_m_100) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {3.0f, 2.0f}; + float beta[] = {1.0f, 3.0f}; + + float norm = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar alpha is zero (operation is C:=beta*C) + */ +CTEST(cgeadd, matrix_n_100_m_100_alpha_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {0.0f, 0.0f}; + float beta[] = {2.5f, 1.0f}; + + float norm = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar beta is zero (operation is C:=alpha*A) + */ +CTEST(cgeadd, matrix_n_100_m_100_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {3.0f, 1.5f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalars alpha, beta is zero (operation is C:= 0) + */ +CTEST(cgeadd, matrix_n_100_m_100_alpha_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {0.0f, 0.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(cgeadd, matrix_n_100_m_50) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M / 2; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test error function for an invalid param n - + * number of columns of A and C + * Must be at least zero. + */ +CTEST(cgeadd, xerbla_n_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = m; + blasint ldc = m; + + int expected_info = 2; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific tests + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + */ +CTEST(cgeadd, xerbla_m_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + */ +CTEST(cgeadd, xerbla_lda_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 5; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + */ +CTEST(cgeadd, xerbla_ldc_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Check if n - number of columns of A, C equal zero. + */ +CTEST(cgeadd, n_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 0; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Check if m - number of rows of A and C equal zero. + */ +CTEST(cgeadd, m_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 0; + + blasint lda = 1; + blasint ldc = 1; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(cgeadd, c_api_matrix_n_100_m_100) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {2.0f, 1.0f}; + float beta[] = {1.0f, 3.0f}; + + float norm = check_cgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * c api option order is row-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(cgeadd, c_api_matrix_n_100_m_100_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {4.0f, 1.5f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * c api option order is row-major order + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(cgeadd, c_api_matrix_n_50_m_100_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = N / 2; + blasint m = M; + + blasint lda = n; + blasint ldc = n; + + float alpha[] = {3.0f, 2.5f}; + float beta[] = {1.0f, 2.0f}; + + float norm = check_cgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar alpha is zero (operation is C:=beta*C) + */ +CTEST(cgeadd, c_api_matrix_n_100_m_100_alpha_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {0.0f, 0.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_cgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar beta is zero (operation is C:=alpha*A) + */ +CTEST(cgeadd, c_api_matrix_n_100_m_100_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {3.0f, 1.5f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalars alpha, beta is zero (operation is C:= 0) + */ +CTEST(cgeadd, c_api_matrix_n_100_m_100_alpha_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {0.0f, 0.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(cgeadd, c_api_matrix_n_100_m_50) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M / 2; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {2.0f, 3.0f}; + float beta[] = {2.0f, 4.0f}; + + float norm = check_cgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test error function for an invalid param order - + * specifies whether A and C stored in + * row-major order or column-major order + */ +CTEST(cgeadd, c_api_xerbla_invalid_order) +{ + CBLAS_ORDER order = INVALID; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 0; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n - + * number of columns of A and C. + * Must be at least zero. + * + * c api option order is column-major order + */ +CTEST(cgeadd, c_api_xerbla_n_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 2; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n - + * number of columns of A and C. + * Must be at least zero. + * + * c api option order is row-major order + */ +CTEST(cgeadd, c_api_xerbla_n_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 2; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + * + * c api option order is column-major order + */ +CTEST(cgeadd, c_api_xerbla_m_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + * + * c api option order is row-major order + */ +CTEST(cgeadd, c_api_xerbla_m_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + * + * c api option order is column-major order + */ +CTEST(cgeadd, c_api_xerbla_lda_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 5; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + * + * c api option order is row-major order + */ +CTEST(cgeadd, c_api_xerbla_lda_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 5; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + * + * c api option order is column-major order + */ +CTEST(cgeadd, c_api_xerbla_ldc_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + * + * c api option order is row-major order + */ +CTEST(cgeadd, c_api_xerbla_ldc_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Check if n - number of columns of A, C equal zero. + * + * c api option order is column-major order + */ +CTEST(cgeadd, c_api_n_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 0; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_cgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Check if m - number of rows of A and C equal zero. + * + * c api option order is column-major order + */ +CTEST(cgeadd, c_api_m_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 0; + + blasint lda = 1; + blasint ldc = 1; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_cgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_cgemm.c b/utest/test_extensions/test_cgemm.c new file mode 100644 index 000000000..cd38d710b --- /dev/null +++ b/utest/test_extensions/test_cgemm.c @@ -0,0 +1,273 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_CGEMM { + float a_test[DATASIZE * DATASIZE * 2]; + float a_verify[DATASIZE * DATASIZE * 2]; + float b_test[DATASIZE * DATASIZE * 2]; + float b_verify[DATASIZE * DATASIZE * 2]; + float c_test[DATASIZE * DATASIZE * 2]; + float c_verify[DATASIZE * DATASIZE * 2]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CGEMM data_cgemm; + +/** + * Test cgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate cgemm. + * + * param transa specifies op(A), the transposition (conjugation) operation applied to A + * param transb specifies op(B), the transposition (conjugation) operation applied to B + * param m specifies the number of rows of the matrix op(A) and of the matrix C + * param n specifies the number of columns of the matrix op(B) and the number of columns of the matrix C + * param k specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B) + * param alpha - scaling factor for the matrix-matrix product + * param lda - leading dimension of matrix A + * param ldb - leading dimension of matrix B + * param beta - scaling factor for matrix C + * param ldc - leading dimension of matrix C + * return norm of difference + */ +static float check_cgemm(char transa, char transb, blasint m, blasint n, blasint k, + float *alpha, blasint lda, blasint ldb, float *beta, blasint ldc) +{ + blasint i; + float alpha_conj[] = {1.0f, 0.0f}; + char transa_verify = transa; + char transb_verify = transb; + + int arows = k, acols = m; + int brows = n, bcols = k; + + if (transa == 'T' || transa == 'C'){ + arows = m; acols = k; + } + + if (transb == 'T' || transb == 'C'){ + brows = k; bcols = n; + } + + srand_generate(data_cgemm.a_test, arows * lda * 2); + srand_generate(data_cgemm.b_test, brows * ldb * 2); + srand_generate(data_cgemm.c_test, n * ldc * 2); + + for (i = 0; i < arows * lda * 2; i++) + data_cgemm.a_verify[i] = data_cgemm.a_test[i]; + + for (i = 0; i < brows * ldb * 2; i++) + data_cgemm.b_verify[i] = data_cgemm.b_test[i]; + + for (i = 0; i < n * ldc * 2; i++) + data_cgemm.c_verify[i] = data_cgemm.c_test[i]; + + if (transa == 'R'){ + cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, arows, acols, alpha_conj, data_cgemm.a_verify, lda, lda); + transa_verify = 'N'; + } + + if (transb == 'R'){ + cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, brows, bcols, alpha_conj, data_cgemm.b_verify, ldb, ldb); + transb_verify = 'N'; + } + + BLASFUNC(cgemm)(&transa_verify, &transb_verify, &m, &n, &k, alpha, data_cgemm.a_verify, &lda, + data_cgemm.b_verify, &ldb, beta, data_cgemm.c_verify, &ldc); + + BLASFUNC(cgemm)(&transa, &transb, &m, &n, &k, alpha, data_cgemm.a_test, &lda, + data_cgemm.b_test, &ldb, beta, data_cgemm.c_test, &ldc); + + return smatrix_difference(data_cgemm.c_test, data_cgemm.c_verify, m, n, ldc*2); +} + +/** + * Test cgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate cgemm. + * Test with the following options: + * + * matrix A is conjugate and transposed + * matrix B is conjugate and not transposed + */ +CTEST(cgemm, conjtransa_conjnotransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'C'; + char transb = 'R'; + float alpha[] = {-2.0, 1.0f}; + float beta[] = {1.0f, -1.0f}; + + float norm = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test cgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate cgemm. + * Test with the following options: + * + * matrix A is not conjugate and not transposed + * matrix B is conjugate and not transposed + */ +CTEST(cgemm, notransa_conjnotransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'N'; + char transb = 'R'; + float alpha[] = {-2.0, 1.0f}; + float beta[] = {1.0f, -1.0f}; + + float norm = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test cgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate cgemm. + * Test with the following options: + * + * matrix A is conjugate and not transposed + * matrix B is conjugate and transposed + */ +CTEST(cgemm, conjnotransa_conjtransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'R'; + char transb = 'C'; + float alpha[] = {-2.0, 1.0f}; + float beta[] = {1.0f, -1.0f}; + + float norm = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test cgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate cgemm. + * Test with the following options: + * + * matrix A is conjugate and not transposed + * matrix B is not conjugate and not transposed + */ +CTEST(cgemm, conjnotransa_notransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'R'; + char transb = 'N'; + float alpha[] = {-2.0, 1.0f}; + float beta[] = {1.0f, -1.0f}; + + float norm = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test cgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate cgemm. + * Test with the following options: + * + * matrix A is conjugate and not transposed + * matrix B is conjugate and not transposed + */ +CTEST(cgemm, conjnotransa_conjnotransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'R'; + char transb = 'R'; + float alpha[] = {-2.0, 1.0f}; + float beta[] = {1.0f, -1.0f}; + + float norm = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test cgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate cgemm. + * Test with the following options: + * + * matrix A is conjugate and not transposed + * matrix B is transposed + */ +CTEST(cgemm, conjnotransa_transb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'R'; + char transb = 'T'; + float alpha[] = {-2.0, 1.0f}; + float beta[] = {1.0f, -1.0f}; + + float norm = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test cgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate cgemm. + * Test with the following options: + * + * matrix A is transposed + * matrix B is conjugate and not transposed + */ +CTEST(cgemm, transa_conjnotransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'T'; + char transb = 'R'; + float alpha[] = {-2.0, 1.0f}; + float beta[] = {1.0f, -1.0f}; + + float norm = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_cgemmt.c b/utest/test_extensions/test_cgemmt.c new file mode 100644 index 000000000..ed9279933 --- /dev/null +++ b/utest/test_extensions/test_cgemmt.c @@ -0,0 +1,2010 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_CGEMMT { + float a_test[DATASIZE * DATASIZE * 2]; + float b_test[DATASIZE * DATASIZE * 2]; + float c_test[DATASIZE * DATASIZE * 2]; + float c_verify[DATASIZE * DATASIZE * 2]; + float c_gemm[DATASIZE * DATASIZE * 2]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CGEMMT data_cgemmt; + +/** + * Compute gemmt via gemm since gemmt is gemm but updates only + * the upper or lower triangular part of the result matrix + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order (for Fortran API column major always) + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param alpha - scaling factor for the matrix-matrix product + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + */ +static void cgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, float *alpha, blasint lda, + blasint ldb, float *beta, blasint ldc) +{ + blasint i, j; + + if(api == 'F') + BLASFUNC(cgemm)(&transa, &transb, &m, &m, &k, alpha, data_cgemmt.a_test, &lda, + data_cgemmt.b_test, &ldb, beta, data_cgemmt.c_gemm, &ldc); + else + cblas_cgemm(order, transa, transb, m, m, k, alpha, data_cgemmt.a_test, lda, + data_cgemmt.b_test, ldb, beta, data_cgemmt.c_gemm, ldc); + + ldc *= 2; + + if (uplo == 'L' || uplo == CblasLower) + { + for (i = 0; i < m; i++) + for (j = i * 2; j < m * 2; j+=2){ + data_cgemmt.c_verify[i * ldc + j] = + data_cgemmt.c_gemm[i * ldc + j]; + data_cgemmt.c_verify[i * ldc + j + 1] = + data_cgemmt.c_gemm[i * ldc + j + 1]; + } + } else { + for (i = 0; i < m; i++) + for (j = 0; j <= i * 2; j+=2){ + data_cgemmt.c_verify[i * ldc + j] = + data_cgemmt.c_gemm[i * ldc + j]; + data_cgemmt.c_verify[i * ldc + j + 1] = + data_cgemmt.c_gemm[i * ldc + j + 1]; + } + } +} + +/** + * Comapare results computed by cgemmt and cgemmt_trusted + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order (for Fortran API column major always) + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param alpha - scaling factor for the matrix-matrix product + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + * return norm of differences + */ +static float check_cgemmt(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, float *alpha, blasint lda, + blasint ldb, float *beta, blasint ldc) +{ + blasint i; + blasint b_cols; + blasint a_cols; + blasint inc = 1; + blasint size_c = m * ldc * 2; + + if(order == CblasColMajor){ + if (transa == 'T' || transa == 'C' || + transa == CblasTrans || transa == CblasConjTrans) + a_cols = m; + else a_cols = k; + + if (transb == 'T' || transb == 'C' || + transb == CblasTrans || transb == CblasConjTrans) + b_cols = k; + else b_cols = m; + } else { + if (transa == 'T' || transa == 'C' || + transa == CblasTrans || transa == CblasConjTrans) + a_cols = k; + else a_cols = m; + + if (transb == 'T' || transb == 'C' || + transb == CblasTrans || transb == CblasConjTrans) + b_cols = m; + else b_cols = k; + } + + srand_generate(data_cgemmt.a_test, a_cols * lda * 2); + srand_generate(data_cgemmt.b_test, b_cols * ldb * 2); + srand_generate(data_cgemmt.c_test, m * ldc * 2); + + for (i = 0; i < m * ldc * 2; i++) + data_cgemmt.c_gemm[i] = data_cgemmt.c_verify[i] = data_cgemmt.c_test[i]; + + cgemmt_trusted(api, order, uplo, transa, transb, m, k, alpha, lda, ldb, beta, ldc); + + if (api == 'F') + BLASFUNC(cgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_cgemmt.a_test, + &lda, data_cgemmt.b_test, &ldb, beta, data_cgemmt.c_test, &ldc); + else + cblas_cgemmt(order, uplo, transa, transb, m, k, alpha, data_cgemmt.a_test, lda, + data_cgemmt.b_test, ldb, beta, data_cgemmt.c_test, ldc); + + for (i = 0; i < m * ldc * 2; i++) + data_cgemmt.c_verify[i] -= data_cgemmt.c_test[i]; + + return BLASFUNC(snrm2)(&size_c, data_cgemmt.c_verify, &inc) / size_c; +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param ldc - leading dimension of C + * param expected_info - expected invalid parameter number in cgemmt + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, blasint lda, blasint ldb, + blasint ldc, int expected_info) +{ + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + set_xerbla("CGEMMT ", expected_info); + + if (api == 'F') + BLASFUNC(cgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_cgemmt.a_test, + &lda, data_cgemmt.b_test, &ldb, beta, data_cgemmt.c_test, &ldc); + else + cblas_cgemmt(order, uplo, transa, transb, m, k, alpha, data_cgemmt.a_test, lda, + data_cgemmt.b_test, ldb, beta, data_cgemmt.c_test, ldc); + + return check_error(); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(cgemmt, upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(cgemmt, upper_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 25, ldb = 25, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(cgemmt, upper_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 25, ldc = 25; + char transa = 'N', transb = 'T'; + char uplo = 'U'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(cgemmt, upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'T'; + char uplo = 'U'; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(cgemmt, upper_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 50, ldc = 25; + char transa = 'R', transb = 'R'; + char uplo = 'U'; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(cgemmt, upper_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'C', transb = 'R'; + char uplo = 'U'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(cgemmt, upper_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'R', transb = 'C'; + char uplo = 'U'; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(cgemmt, upper_M_50_K_25_a_conjtrans_b_conjtrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'C', transb = 'C'; + char uplo = 'U'; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(cgemmt, upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(cgemmt, upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 0.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(cgemmt, lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(cgemmt, lower_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 25, ldb = 25, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'L'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(cgemmt, lower_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 25, ldc = 25; + char transa = 'N', transb = 'T'; + char uplo = 'L'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(cgemmt, lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'T'; + char uplo = 'L'; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(cgemmt, lower_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 50, ldc = 25; + char transa = 'R', transb = 'R'; + char uplo = 'L'; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(cgemmt, lower_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'C', transb = 'R'; + char uplo = 'L'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(cgemmt, lower_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'R', transb = 'C'; + char uplo = 'L'; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(cgemmt, lower_M_50_K_25_a_conjtrans_b_conjtrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'C', transb = 'C'; + char uplo = 'L'; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(cgemmt, lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(cgemmt, lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 0.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(cgemmt, c_api_colmajor_upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(cgemmt, c_api_colmajor_upper_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 25, ldb = 25, ldc = 50; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(cgemmt, c_api_colmajor_upper_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 25, ldc = 25; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(cgemmt, c_api_colmajor_upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(cgemmt, c_api_colmajor_upper_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 50, ldc = 25; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasConjNoTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(cgemmt, c_api_colmajor_upper_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasConjTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(cgemmt, c_api_colmajor_upper_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasConjNoTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(cgemmt, c_api_colmajor_upper_M_50_K_25_a_conjtrans_b_conjtrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasConjTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(cgemmt, c_api_colmajor_upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(cgemmt, c_api_colmajor_upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 0.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(cgemmt, c_api_colmajor_lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(cgemmt, c_api_colmajor_lower_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 25, ldb = 25, ldc = 50; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(cgemmt, c_api_colmajor_lower_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 25, ldc = 25; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(cgemmt, c_api_colmajor_lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(cgemmt, c_api_colmajor_lower_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 50, ldc = 25; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasConjNoTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(cgemmt, c_api_colmajor_lower_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasConjTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(cgemmt, c_api_colmajor_lower_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasConjNoTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(cgemmt, c_api_colmajor_lower_M_50_K_25_a_conjtrans_b_conjtrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasConjTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(cgemmt, c_api_colmajor_lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(cgemmt, c_api_colmajor_lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 0.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(cgemmt, c_api_rowmajor_upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(cgemmt, c_api_rowmajor_upper_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(cgemmt, c_api_rowmajor_upper_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 50, ldb = 50, ldc = 25; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {-1.0f, -1.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(cgemmt, c_api_rowmajor_upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(cgemmt, c_api_rowmajor_upper_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 50, ldb = 25, ldc = 25; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasConjNoTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(cgemmt, c_api_rowmajor_upper_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasConjTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(cgemmt, c_api_rowmajor_upper_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasConjNoTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(cgemmt, c_api_rowmajor_upper_M_25_K_50_a_conjtrans_b_conjtrans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 50, ldc = 25; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasConjTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(cgemmt, c_api_rowmajor_upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(cgemmt, c_api_rowmajor_upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 0.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(cgemmt, c_api_rowmajor_lower_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 50, ldb = 50, ldc = 25; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(cgemmt, c_api_rowmajor_lower_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 50, ldb = 25, ldc = 25; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasConjNoTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasConjTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasConjNoTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_25_a_conjtrans_b_conjtrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 25, ldc = 50; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasConjTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(cgemmt, c_api_rowmajor_lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(cgemmt, c_api_rowmajor_lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 0.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test error function for an invalid param uplo. + * Must be upper (U) or lower (L). + */ +CTEST(cgemmt, xerbla_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'O'; + int expected_info = 1; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param transa. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(cgemmt, xerbla_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'O', transb = 'N'; + char uplo = 'U'; + int expected_info = 2; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param transb. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(cgemmt, xerbla_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'O'; + char uplo = 'U'; + int expected_info = 3; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(cgemmt, xerbla_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 4; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(cgemmt, xerbla_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 5; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(cgemmt, xerbla_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 100, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + int expected_info = 8; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B not transposed. + */ +CTEST(cgemmt, xerbla_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 10; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(cgemmt, xerbla_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + int expected_info = 13; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. + * Test error function for an invalid param order. + * Must be column or row major. + */ +CTEST(cgemmt, xerbla_c_api_major_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 0; + + int passed = check_badargs('C', 'O', CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param uplo. + * Must be upper or lower. + */ +CTEST(cgemmt, xerbla_c_api_colmajor_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 1; + + int passed = check_badargs('C', CblasColMajor, 'O', CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param transa. + * Must be trans or no-trans. + */ +CTEST(cgemmt, xerbla_c_api_colmajor_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 2; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, 'O', CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param transb. + * Must be trans or no-trans. + */ +CTEST(cgemmt, xerbla_c_api_colmajor_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 3; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, 'O', + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(cgemmt, xerbla_c_api_colmajor_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 4; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(cgemmt, xerbla_c_api_colmajor_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 5; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(cgemmt, xerbla_c_api_colmajor_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 100, ldc = 50; + int expected_info = 8; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B not transposed. + */ +CTEST(cgemmt, xerbla_c_api_colmajor_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 10; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(cgemmt, xerbla_c_api_colmajor_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 13; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param uplo. + * Must be upper or lower. + */ +CTEST(cgemmt, xerbla_c_api_rowmajor_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 1; + + int passed = check_badargs('C', CblasRowMajor, 'O', CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param transa. + * Must be trans or no-trans. + */ +CTEST(cgemmt, xerbla_c_api_rowmajor_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 2; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, 'O', CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param transb. + * Must be trans or no-trans. + */ +CTEST(cgemmt, xerbla_c_api_rowmajor_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 3; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, 'O', + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(cgemmt, xerbla_c_api_rowmajor_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 4; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(cgemmt, xerbla_c_api_rowmajor_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 5; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(cgemmt, xerbla_c_api_rowmajor_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 8; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B transposed. + */ +CTEST(cgemmt, xerbla_c_api_rowmajor_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 10; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(cgemmt, xerbla_c_api_rowmajor_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 100, ldb = 100, ldc = 50; + int expected_info = 13; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_cgemv_n.c b/utest/test_extensions/test_cgemv_n.c new file mode 100644 index 000000000..60c9af86a --- /dev/null +++ b/utest/test_extensions/test_cgemv_n.c @@ -0,0 +1,340 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_CSPMV_N { + float a_test[DATASIZE * DATASIZE * 2]; + float b_test[DATASIZE * 2 * INCREMENT]; + float c_test[DATASIZE * 2 * INCREMENT]; + float c_verify[DATASIZE * 2 * INCREMENT]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CSPMV_N data_cgemv_n; + +/** + * cgemv not transposed reference code + * + * param trans specifies whether matris A is conj or/and xconj + * param m - number of rows of A + * param n - number of columns of A + * param alpha - scaling factor for the matrib-vector product + * param a - buffer holding input matrib A + * param lda - leading dimension of matrix A + * param b - Buffer holding input vector b + * param inc_b - stride of vector b + * param beta - scaling factor for vector c + * param c - buffer holding input/output vector c + * param inc_c - stride of vector c + */ +static void cgemv_n_trusted(char trans, blasint m, blasint n, float *alpha, float *a, + blasint lda, float *b, blasint inc_b, float *beta, float *c, + blasint inc_c) +{ + blasint i, j; + blasint i2 = 0; + blasint ib = 0, ic = 0; + + float temp_r, temp_i; + + float *a_ptr = a; + blasint lda2 = 2*lda; + + blasint inc_b2 = 2 * inc_b; + blasint inc_c2 = 2 * inc_c; + + BLASFUNC(cscal)(&m, beta, c, &inc_c); + + for (j = 0; j < n; j++) + { + + if (trans == 'N' || trans == 'R') { + temp_r = alpha[0] * b[ib] - alpha[1] * b[ib+1]; + temp_i = alpha[0] * b[ib+1] + alpha[1] * b[ib]; + } else { + temp_r = alpha[0] * b[ib] + alpha[1] * b[ib+1]; + temp_i = alpha[0] * b[ib+1] - alpha[1] * b[ib]; + } + + ic = 0; + i2 = 0; + + for (i = 0; i < m; i++) + { + if (trans == 'N') { + c[ic] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1]; + c[ic+1] += temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2]; + } + if (trans == 'O') { + c[ic] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1]; + c[ic+1] += temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2]; + } + if (trans == 'R') { + c[ic] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1]; + c[ic+1] -= temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2]; + } + if (trans == 'S') { + c[ic] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1]; + c[ic+1] -= temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2]; + } + i2 += 2; + ic += inc_c2; + } + a_ptr += lda2; + ib += inc_b2; + } + +} + +/** + * Comapare results computed by cgemv and cgemv_n_trusted + * + * param trans specifies whether matris A is conj or/and xconj + * param m - number of rows of A + * param n - number of columns of A + * param alpha - scaling factor for the matrib-vector product + * param lda - leading dimension of matrix A + * param inc_b - stride of vector b + * param beta - scaling factor for vector c + * param inc_c - stride of vector c + * return norm of differences + */ +static float check_cgemv_n(char trans, blasint m, blasint n, float *alpha, blasint lda, + blasint inc_b, float *beta, blasint inc_c) +{ + blasint i; + + srand_generate(data_cgemv_n.a_test, n * lda); + srand_generate(data_cgemv_n.b_test, 2 * n * inc_b); + srand_generate(data_cgemv_n.c_test, 2 * m * inc_c); + + for (i = 0; i < m * 2 * inc_c; i++) + data_cgemv_n.c_verify[i] = data_cgemv_n.c_test[i]; + + cgemv_n_trusted(trans, m, n, alpha, data_cgemv_n.a_test, lda, data_cgemv_n.b_test, + inc_b, beta, data_cgemv_n.c_test, inc_c); + BLASFUNC(cgemv)(&trans, &m, &n, alpha, data_cgemv_n.a_test, &lda, data_cgemv_n.b_test, + &inc_b, beta, data_cgemv_n.c_verify, &inc_c); + + for (i = 0; i < m * 2 * inc_c; i++) + data_cgemv_n.c_verify[i] -= data_cgemv_n.c_test[i]; + + return BLASFUNC(scnrm2)(&n, data_cgemv_n.c_verify, &inc_c); +} + +/** + * Test cgemv by comparing it against reference + * with the following options: + * + * A is xconj + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(cgemv, trans_o_square_matrix) +{ + blasint n = 100, m = 100, lda = 100; + blasint inc_b = 1, inc_c = 1; + char trans = 'O'; + float alpha[] = {2.0f, -1.0f}; + float beta[] = {1.4f, 5.0f}; + + float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgemv by comparing it against reference + * with the following options: + * + * A is xconj + * Number of rows of A is 50 + * Number of colums of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(cgemv, trans_o_rectangular_matrix_rows_less_then_cols) +{ + blasint n = 100, m = 50, lda = 50; + blasint inc_b = 1, inc_c = 1; + char trans = 'O'; + float alpha[] = {2.0f, -1.0f}; + float beta[] = {1.4f, 5.0f}; + + float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgemv by comparing it against reference + * with the following options: + * + * A is xconj + * Number of rows of A is 100 + * Number of colums of A is 50 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(cgemv, trans_o_rectangular_matrix_cols_less_then_rows) +{ + blasint n = 50, m = 100, lda = 100; + blasint inc_b = 1, inc_c = 1; + char trans = 'O'; + float alpha[] = {2.0f, -1.0f}; + float beta[] = {1.4f, 5.0f}; + + float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgemv by comparing it against reference + * with the following options: + * + * A is xconj + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 2 + */ +CTEST(cgemv, trans_o_double_strides) +{ + blasint n = 100, m = 100, lda = 100; + blasint inc_b = 2, inc_c = 2; + char trans = 'O'; + float alpha[] = {2.0f, -1.0f}; + float beta[] = {1.4f, 5.0f}; + + float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgemv by comparing it against reference + * with the following options: + * + * A is xconj and conj + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(cgemv, trans_s_square_matrix) +{ + blasint n = 100, m = 100, lda = 100; + blasint inc_b = 1, inc_c = 1; + char trans = 'S'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.4f, 5.0f}; + + float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgemv by comparing it against reference + * with the following options: + * + * A is xconj and conj + * Number of rows of A is 50 + * Number of colums of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(cgemv, trans_s_rectangular_matrix_rows_less_then_cols) +{ + blasint n = 100, m = 50, lda = 50; + blasint inc_b = 1, inc_c = 1; + char trans = 'S'; + float alpha[] = {2.0f, -1.0f}; + float beta[] = {1.4f, 5.0f}; + + float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgemv by comparing it against reference + * with the following options: + * + * A is xconj and conj + * Number of rows of A is 100 + * Number of colums of A is 50 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(cgemv, trans_s_rectangular_matrix_cols_less_then_rows) +{ + blasint n = 50, m = 100, lda = 100; + blasint inc_b = 1, inc_c = 1; + char trans = 'S'; + float alpha[] = {2.0f, -1.0f}; + float beta[] = {1.4f, 0.0f}; + + float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgemv by comparing it against reference + * with the following options: + * + * A is xconj and conj + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 2 + */ +CTEST(cgemv, trans_s_double_strides) +{ + blasint n = 100, m = 100, lda = 100; + blasint inc_b = 2, inc_c = 2; + char trans = 'S'; + float alpha[] = {2.0f, -1.0f}; + float beta[] = {1.0f, 5.0f}; + + float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +#endif diff --git a/utest/test_extensions/test_cgemv_t.c b/utest/test_extensions/test_cgemv_t.c new file mode 100644 index 000000000..cb4e5ad9e --- /dev/null +++ b/utest/test_extensions/test_cgemv_t.c @@ -0,0 +1,1132 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define N 100 +#define M 100 +#define INCREMENT 2 + +struct DATA_CGEMV_T { + float a_test[N * M * 2]; + float a_verify[N * M * 2]; + float y_test[M * INCREMENT * 2]; + float y_verify[M * INCREMENT * 2]; + float x_test[M * INCREMENT * 2]; + float x_verify[M * INCREMENT * 2]; +}; + +// SINGLE_EPS_ZGEMV = MAX_VAL * NUMBER OF OPERATIONS * FLT_EPSILON +// SINGLE_EPS_ZGEMV = 5.0 * O(100 * 100) * 1.19e-07 = 5*e-03 +#define SINGLE_EPS_ZGEMV 5e-03 + +#ifdef BUILD_COMPLEX +static struct DATA_CGEMV_T data_cgemv_t; + +/** + * Find product of matrix-vector multiplication + * + * param n specifies number of columns of A + * param m specifies number of rows of A and size of vector x + * param lda specifies leading dimension of A + * param inc_x specifies increment of vector x + */ +static void matrix_vector_product(blasint n, blasint m, blasint lda, blasint inc_x) +{ + blasint i; + float *a_ptr = data_cgemv_t.a_verify; + float *x_ptr = data_cgemv_t.x_test; + float *x_res = data_cgemv_t.x_verify; + + openblas_complex_float result; + + for (i = 0; i < n * inc_x; i+= inc_x) + { + result = cblas_cdotu(lda, a_ptr, 1, x_ptr, inc_x); + x_res[0] = CREAL(result); + x_res[1] = CIMAG(result); + a_ptr += lda * 2; + x_res += 2 * inc_x; + } +} + +/** + * Test cgemv by comparing it against comatcopy, caxpby and + * reference func matrix_vector_product + * + * comatcopy perform operation: op(A) + * matrix_vector_product perform operation: A*x + * caxpby perform operation: alpha*x + beta*y + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param m specifies number of rows of A + * param n specifies number of columns of A + * param alpha specifies scalar alpha + * param lda specifies leading dimension of the matrix A + * param inc_x specifies increment for vector x + * param beta specifies scalar beta + * param inc_y specifies increment for vector y + * return norm of difference between cgemv and result of reference funcs + */ +static float check_cgemv(char api, char order, char trans, blasint m, blasint n, float *alpha, + blasint lda, blasint inc_x, float *beta, blasint inc_y) +{ + blasint i; + + enum CBLAS_ORDER corder; + enum CBLAS_TRANSPOSE ctrans; + + // Transpose parameters for comatcopy + // cgemv_t perform operation on transposed matrix, no need to transpose a_verify + char trans_copy; + char ctrans_copy; + + // Param alpha for comatcopy, scale on alpha perform caxpby + float alpha_one[] = {1.0f, 0.0f}; + + memset(data_cgemv_t.x_verify, 0.0f, m * inc_x * 2 * sizeof(float)); + + // Fill matrix A, vectors x, y + srand_generate(data_cgemv_t.a_test, lda * n * 2); + srand_generate(data_cgemv_t.x_test, m * inc_x * 2); + srand_generate(data_cgemv_t.y_test, m * inc_y * 2); + + // Copy vector y for reference funcs + for (i = 0; i < m * inc_y * 2; i++) { + data_cgemv_t.y_verify[i] = data_cgemv_t.y_test[i]; + } + + if (api == 'F') { + if (trans == 'T') trans_copy = 'N'; + if (trans == 'C') trans_copy = 'R'; + if (trans == 'U') trans_copy = 'R'; + if (trans == 'D') trans_copy = 'N'; + + // Perform operation: op(A) + BLASFUNC(comatcopy)(&order, &trans_copy, &m, &n, alpha_one, data_cgemv_t.a_test, &lda, data_cgemv_t.a_verify, &lda); + + // Find A*x + matrix_vector_product(n, m, lda, inc_x); + + // Find conj(x) + if (trans == 'U' || trans == 'D') { + cconjugate_vector(m, inc_x, data_cgemv_t.x_verify); + } + + // Find alpha*x+beta*y + BLASFUNC(caxpby)(&n, alpha, data_cgemv_t.x_verify, &inc_x, beta, data_cgemv_t.y_verify, &inc_y); + + BLASFUNC(cgemv)(&trans, &m, &n, alpha, data_cgemv_t.a_test, + &lda, data_cgemv_t.x_test, &inc_x, beta, data_cgemv_t.y_test, &inc_y); + } + else { + if (order == 'C') corder = CblasColMajor; + if (order == 'R') corder = CblasRowMajor; + if (trans == 'T') {ctrans = CblasTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasTrans : CblasNoTrans;} + if (trans == 'N') {ctrans = CblasNoTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasNoTrans : CblasTrans;} + if (trans == 'C') {ctrans = CblasConjTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasConjTrans : CblasConjNoTrans;} + if (trans == 'R') {ctrans = CblasConjNoTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasConjNoTrans : CblasConjTrans;} + + // Perform operation: op(A) + cblas_comatcopy(corder, ctrans_copy, m, n, alpha_one, data_cgemv_t.a_test, lda, data_cgemv_t.a_verify, lda); + + // Find A*x + matrix_vector_product(n, m, lda, inc_x); + + // Find alpha*x+beta*y + cblas_caxpby(n, alpha, data_cgemv_t.x_verify, inc_x, beta, data_cgemv_t.y_verify, inc_y); + + cblas_cgemv(corder, ctrans, m, n, alpha, data_cgemv_t.a_test, + lda, data_cgemv_t.x_test, inc_x, beta, data_cgemv_t.y_test, inc_y); + } + + // Find the differences between output vector caculated by cgemv and reference funcs + for (i = 0; i < m * inc_y * 2; i++) + data_cgemv_t.y_test[i] -= data_cgemv_t.y_verify[i]; + + // Find the norm of differences + return cblas_scnrm2(m, data_cgemv_t.y_test, inc_y); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param m specifies number of rows of A + * param n specifies number of columns of A + * param lda specifies leading dimension of the matrix A + * param inc_x specifies increment for vector x + * param inc_y specifies increment for vector y + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char order, char trans, blasint m, blasint n, + blasint lda, blasint inc_x, blasint inc_y, int expected_info) +{ + float alpha[] = {1.0f, 1.0f}; + float a[] = {1.0f, 1.0f}; + float x[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + float y[] = {1.0f, 1.0f}; + + set_xerbla("CGEMV ", expected_info); + + BLASFUNC(cgemv)(&trans, &m, &n, alpha, a, &lda, x, &inc_x, beta, y, &inc_y); + + return check_error(); +} + +/** + * C API specific function + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param m specifies number of rows of A + * param n specifies number of columns of A + * param lda specifies leading dimension of the matrix A + * param inc_x specifies increment for vector x + * param inc_y specifies increment for vector y + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int c_api_check_badargs(CBLAS_ORDER corder, CBLAS_TRANSPOSE ctrans, blasint m, blasint n, + blasint lda, blasint inc_x, blasint inc_y, int expected_info) +{ + float alpha[] = {1.0f, 1.0f}; + float a[] = {1.0f, 1.0f}; + float x[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + float y[] = {1.0f, 1.0f}; + + set_xerbla("CGEMV ", expected_info); + + cblas_cgemv(corder, ctrans, m, n, alpha, a, lda, x, inc_x, beta, y, inc_y); + + return check_error(); +} + +/** + * Fortran API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * inc x = 2, inc y = 1 + * alpha_r = 1.0f, alpha_i = 1.0f + * beta_r = 2.0f, beta_i = 2.0f + */ +CTEST(cgemv, colmajor_trans_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'T'; + + float alpha[] = {2.0f, 1.0f}; + float beta[] = {1.0f, 2.0f}; + + blasint inc_x = 1; + blasint inc_y = 1; + + float norm = check_cgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * inc x = 2, inc y = 1 + * alpha_r = 1.0f, alpha_i = 1.0f + * beta_r = 2.0f, beta_i = 2.0f + */ +CTEST(cgemv, colmajor_trans_col_100_row_100_inc_x_2_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'T'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {2.0f, 2.0f}; + + blasint inc_x = 2; + blasint inc_y = 1; + + float norm = check_cgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate A + * Square matrix + * inc x = 1, inc y = 1 + * alpha_r = 2.0f, alpha_i = 1.0f + * beta_r = 2.0f, beta_i = 1.0f + */ +CTEST(cgemv, colmajor_conjtrans_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'C'; + + float alpha[] = {2.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + blasint inc_x = 1; + blasint inc_y = 1; + + float norm = check_cgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate A + * Square matrix + * inc x = 1, inc y = 2 + * alpha_r = 2.0f, alpha_i = 1.0f + * beta_r = 2.0f, beta_i = 1.0f + */ +CTEST(cgemv, colmajor_conjtrans_col_100_row_100_inc_x_1_y_2) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'C'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + blasint inc_x = 1; + blasint inc_y = 2; + + float norm = check_cgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and x conjugate + * Square matrix + * inc x = 1, inc y = 1 + * alpha_r = 2.0f, alpha_i = 1.0f + * beta_r = 2.0f, beta_i = 1.0f + */ +CTEST(cgemv, colmajor_trans_x_conj_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'U'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + blasint inc_x = 1; + blasint inc_y = 1; + + float norm = check_cgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and x conjugate + * Square matrix + * inc x = 2, inc y = 2 + * alpha_r = 1.0f, alpha_i = 2.0f + * beta_r = 1.0f, beta_i = 1.0f + */ +CTEST(cgemv, colmajor_trans_x_conj_col_100_row_100_inc_x_2_y_2) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'U'; + + float alpha[] = {1.0f, 2.0f}; + float beta[] = {1.0f, 1.0f}; + + blasint inc_x = 2; + blasint inc_y = 2; + + float norm = check_cgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + + +/** + * Fortran API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition, conjugate A, conjugate x + * Square matrix + * inc x = 2, inc y = 2 + * alpha_r = 2.0f, alpha_i = 1.0f + * beta_r = 1.0f, beta_i = 2.0f + */ +CTEST(cgemv, colmajor_conjtrans_x_conj_col_100_row_100_inc_x_1_y_2) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'D'; + + float alpha[] = {2.0f, 1.0f}; + float beta[] = {1.0f, 2.0f}; + + blasint inc_x = 1; + blasint inc_y = 2; + + float norm = check_cgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition, conjugate A, conjugate x + * Square matrix + * inc x = 2, inc y = 1 + * alpha_r = 2.0f, alpha_i = 1.0f + * beta_r = 1.0f, beta_i = 2.0f + */ +CTEST(cgemv, c_api_colmajor_trans_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'T'; + + float alpha[] = {2.0f, 1.0f}; + float beta[] = {1.0f, 2.0f}; + + blasint inc_x = 1; + blasint inc_y = 1; + + float norm = check_cgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate A + * Square matrix + * inc x = 2, inc y = 1 + * alpha_r = 1.0f, alpha_i = 1.0f + * beta_r = 1.0f, beta_i = 2.0f + */ +CTEST(cgemv, c_api_colmajor_conjtrans_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'C'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 2.0f}; + + blasint inc_x = 1; + blasint inc_y = 1; + + float norm = check_cgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate A + * Square matrix + * inc x = 1, inc y = 2 + * alpha_r = 1.0f, alpha_i = 1.0f + * beta_r = 1.0f, beta_i = 2.0f + */ +CTEST(cgemv, c_api_colmajor_conjtrans_col_100_row_100_inc_x_1_y_2) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'C'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 2.0f}; + + blasint inc_x = 1; + blasint inc_y = 2; + + float norm = check_cgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Row Major + * Square matrix + * inc x = 1, inc y = 1 + * alpha_r = 2.0f, alpha_i = 1.0f + * beta_r = 1.0f, beta_i = 1.0f + */ +CTEST(cgemv, c_api_rowmajor_notrans_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'R'; + char trans = 'N'; + + float alpha[] = {2.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + blasint inc_x = 1; + blasint inc_y = 1; + + float norm = check_cgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Row Major + * No trans + * Square matrix + * inc x = 2, inc y = 2 + * alpha_r = 1.0f, alpha_i = 1.0f + * beta_r = 3.0f, beta_i = 2.0f + */ +CTEST(cgemv, c_api_rowmajor_notrans_col_100_row_100_inc_x_2_y_2) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'R'; + char trans = 'N'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {3.0f, 1.0f}; + + blasint inc_x = 2; + blasint inc_y = 2; + + float norm = check_cgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Conjugate + * Square matrix + * inc x = 1, inc y = 1 + * alpha_r = 1.0f, alpha_i = 3.0f + * beta_r = 1.0f, beta_i = 2.5f + */ +CTEST(cgemv, c_api_rowmajor_conj_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'R'; + char trans = 'R'; + + float alpha[] = {1.0f, 3.0f}; + float beta[] = {1.0f, 2.5f}; + + blasint inc_x = 1; + blasint inc_y = 1; + + float norm = check_cgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Row Major + * Conjugate + * Square matrix + * inc x = 2, inc y = 1 + * alpha_r = 1.0f, alpha_i = 1.0f + * beta_r = 1.0f, beta_i = 1.5f + */ +CTEST(cgemv, c_api_rowmajor_conj_col_100_row_100_inc_x_2_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'R'; + char trans = 'R'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.5f}; + + blasint inc_x = 2; + blasint inc_y = 1; + + float norm = check_cgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test error function for an invalid param inc_y. + * Must be positive + * + * Column major + */ +CTEST(cgemv, xerbla_invalid_inc_y) +{ + char order = 'C'; + char trans = 'T'; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 0; + + int expected_info = 11; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param inc_y. + * Must be positive + * + * Column major + */ +CTEST(cgemv, c_api_xerbla_invalid_inc_y_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 0; + + int expected_info = 11; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param inc_y. + * Must be positive + * + * Row major + */ +CTEST(cgemv, c_api_xerbla_invalid_inc_y_row_major) +{ + enum CBLAS_ORDER corder = CblasRowMajor; + enum CBLAS_TRANSPOSE ctrans = CblasNoTrans; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 0; + + int expected_info = 11; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param inc_x. + * Must be positive + * + * Column major + */ +CTEST(cgemv, xerbla_invalid_inc_x) +{ + char order = 'C'; + char trans = 'T'; + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 0; + blasint inc_y = 1; + + int expected_info = 8; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param inc_x. + * Must be positive + * + * Column major + */ +CTEST(cgemv, c_api_xerbla_invalid_inc_x_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 0; + blasint inc_y = 1; + + int expected_info = 8; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param inc_x. + * Must be positive + * + * Row major + */ +CTEST(cgemv, c_api_xerbla_invalid_inc_x_row_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 0; + blasint inc_y = 1; + + int expected_info = 8; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param n. + * Must be positive. + * + * Column major + */ +CTEST(cgemv, xerbla_invalid_n) +{ + char order = 'C'; + char trans = 'T'; + + blasint m = 1, n = INVALID; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 3; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n. + * Must be positive. + * + * Column major + */ +CTEST(cgemv, c_api_xerbla_invalid_n_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = INVALID; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 3; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n. + * Must be positive. + * + * Row major + */ +CTEST(cgemv, c_api_xerbla_invalid_n_row_major) +{ + enum CBLAS_ORDER corder = CblasRowMajor; + enum CBLAS_TRANSPOSE ctrans = CblasNoTrans; + + blasint m = INVALID, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 3; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param m. + * Must be positive. + * + * Column major + */ +CTEST(cgemv, xerbla_invalid_m) +{ + char order = 'C'; + char trans = 'T'; + + blasint m = INVALID, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 2; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m. + * Must be positive. + * + * Column major + */ +CTEST(cgemv, c_api_xerbla_invalid_m_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = INVALID, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 2; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m. + * Must be positive. + * + * Row major + */ +CTEST(cgemv, c_api_xerbla_invalid_m_row_major) +{ + enum CBLAS_ORDER corder = CblasRowMajor; + enum CBLAS_TRANSPOSE ctrans = CblasNoTrans; + + blasint m = 1, n = INVALID; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 2; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param lda. + * lda must be at least n. + * + * Column major + */ +CTEST(cgemv, xerbla_invalid_lda) +{ + char order = 'C'; + char trans = 'T'; + + blasint m = 1, n = 1; + blasint lda = INVALID; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 6; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda. + * If matrices are stored using col major layout, + * lda must be at least m. + * + * Column major + */ +CTEST(cgemv, c_api_xerbla_invalid_lda_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = 1; + blasint lda = INVALID; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 6; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda. + * If matrices are stored using col major layout, + * lda must be at least n. + * + * Column major + */ +CTEST(cgemv, c_api_xerbla_invalid_lda_row_major) +{ + enum CBLAS_ORDER corder = CblasRowMajor; + enum CBLAS_TRANSPOSE ctrans = CblasNoTrans; + + blasint m = 1, n = 1; + blasint lda = INVALID; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 6; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param trans. + * + * Column major + */ +CTEST(cgemv, xerbla_invalid_trans) +{ + char order = 'C'; + char trans = 'Z'; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 1; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param trans. + * + * Column major + */ +CTEST(cgemv, c_api_xerbla_invalid_trans_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = INVALID; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 1; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param trans. + * + * Row major + */ +CTEST(cgemv, c_api_xerbla_invalid_trans_row_major) +{ + enum CBLAS_ORDER corder = CblasRowMajor; + enum CBLAS_TRANSPOSE ctrans = INVALID; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 1; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param order. + */ +CTEST(cgemv, c_api_xerbla_invalid_order_col_major) +{ + enum CBLAS_ORDER corder = INVALID; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 0; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif diff --git a/utest/test_extensions/test_cimatcopy.c b/utest/test_extensions/test_cimatcopy.c new file mode 100644 index 000000000..a4b1e30ac --- /dev/null +++ b/utest/test_extensions/test_cimatcopy.c @@ -0,0 +1,818 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_CIMATCOPY { + float a_test[DATASIZE * DATASIZE * 2]; + float a_verify[DATASIZE * DATASIZE * 2]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CIMATCOPY data_cimatcopy; + +/** + * Comapare results computed by cimatcopy and reference func + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows specifies number of rows of A + * param cols specifies number of columns of A + * param alpha specifies scaling factor for matrix A + * param lda_src - leading dimension of the matrix A + * param lda_dst - leading dimension of output matrix A + * return norm of difference between openblas and reference func + */ +static float check_cimatcopy(char api, char order, char trans, blasint rows, blasint cols, float *alpha, + blasint lda_src, blasint lda_dst) +{ + blasint m, n; + blasint rows_out, cols_out; + enum CBLAS_ORDER corder; + enum CBLAS_TRANSPOSE ctrans; + int conj = -1; + + if (order == 'C') { + n = rows; m = cols; + } + else { + m = rows; n = cols; + } + + if(trans == 'T' || trans == 'C') { + rows_out = n; cols_out = m*2; + if (trans == 'C') + conj = 1; + } + else { + rows_out = m; cols_out = n*2; + if (trans == 'R') + conj = 1; + } + + srand_generate(data_cimatcopy.a_test, lda_src*m*2); + + if (trans == 'T' || trans == 'C') { + ctranspose(m, n, alpha, data_cimatcopy.a_test, lda_src, data_cimatcopy.a_verify, lda_dst, conj); + } + else { + ccopy(m, n, alpha, data_cimatcopy.a_test, lda_src, data_cimatcopy.a_verify, lda_dst, conj); + } + + if (api == 'F') { + BLASFUNC(cimatcopy)(&order, &trans, &rows, &cols, alpha, data_cimatcopy.a_test, + &lda_src, &lda_dst); + } + else { + if (order == 'C') corder = CblasColMajor; + if (order == 'R') corder = CblasRowMajor; + if (trans == 'T') ctrans = CblasTrans; + if (trans == 'N') ctrans = CblasNoTrans; + if (trans == 'C') ctrans = CblasConjTrans; + if (trans == 'R') ctrans = CblasConjNoTrans; + cblas_cimatcopy(corder, ctrans, rows, cols, alpha, data_cimatcopy.a_test, + lda_src, lda_dst); + } + + // Find the differences between output matrix computed by cimatcopy and reference func + return smatrix_difference(data_cimatcopy.a_test, data_cimatcopy.a_verify, cols_out, rows_out, 2*lda_dst); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows specifies number of rows of A + * param cols specifies number of columns of A + * param lda_src - leading dimension of the matrix A + * param lda_dst - leading dimension of output matrix A + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char order, char trans, blasint rows, blasint cols, + blasint lda_src, blasint lda_dst, int expected_info) +{ + float alpha[] = {1.0f, 1.0f}; + + set_xerbla("CIMATCOPY", expected_info); + + BLASFUNC(cimatcopy)(&order, &trans, &rows, &cols, alpha, data_cimatcopy.a_test, + &lda_src, &lda_dst); + + return check_error(); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha_r = -3.0, alpha_i = 1.0 + */ +CTEST(cimatcopy, colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + float alpha[] = {-3.0f, 1.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, colmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'R'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(cimatcopy, colmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'C'; + float alpha[] = {2.0f, 1.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, colmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, colmajor_notrans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda_src = 50, lda_dst = 50; + char order = 'C'; + char trans = 'N'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific tests + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate + * Rectangular matrix + * alpha_r = 1.0, alpha_i = 1.0 + */ +CTEST(cimatcopy, colmajor_conjtrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'C'; + float alpha[] = {1.0f, 1.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy and conjugate + * Rectangular matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, colmajor_conj_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda_src = 50, lda_dst = 50; + char order = 'C'; + char trans = 'R'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha_r = 2.0, alpha_i = 3.0 + */ +CTEST(cimatcopy, rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + float alpha[] = {2.0f, 3.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific tests + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, rowmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'R'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific tests + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(cimatcopy, rowmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'C'; + float alpha[] = {2.0f, 1.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(cimatcopy, rowmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 50; + char order = 'R'; + char trans = 'N'; + float alpha[] = {2.0f, 1.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Rectangular matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, rowmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy and conjugate + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(cimatcopy, rowmajor_conj_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 50; + char order = 'R'; + char trans = 'R'; + float alpha[] = {1.5f, -1.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, rowmajor_conjtrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 100; + char order = 'R'; + char trans = 'C'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha_r = 3.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, c_api_colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + float alpha[] = {3.0f, 2.0f}; + + float norm = check_cimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha_r = 3.0, alpha_i = 1.5 + */ +CTEST(cimatcopy, c_api_colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + float alpha[] = {3.0f, 1.5f}; + + float norm = check_cimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha_r = 3.0, alpha_i = 1.0 + */ +CTEST(cimatcopy, c_api_rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + float alpha[] = {3.0f, 1.0f}; + + float norm = check_cimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, c_api_colmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'R'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(cimatcopy, c_api_colmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'C'; + float alpha[] = {2.0f, 1.0f}; + + float norm = check_cimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha_r = 1.0, alpha_i = 1.0 + */ +CTEST(cimatcopy, c_api_rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + float alpha[] = {1.0f, 1.0f}; + + float norm = check_cimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy and conjugate + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(cimatcopy, c_api_rowmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'R'; + float alpha[] = {1.5f, -1.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, c_api_rowmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'C'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test error function for an invalid param order. + * Must be column (C) or row major (R). + */ +CTEST(cimatcopy, xerbla_invalid_order) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'O'; + char trans = 'T'; + int expected_info = 1; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param trans. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(cimatcopy, xerbla_invalid_trans) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'O'; + int expected_info = 2; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_src. + * If matrices are stored using row major layout, + * lda_src must be at least n. + */ +CTEST(cimatcopy, xerbla_rowmajor_invalid_lda) +{ + blasint m = 50, n = 100; + blasint lda_src = 50, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_src. + * If matrices are stored using column major layout, + * lda_src must be at least m. + */ +CTEST(cimatcopy, xerbla_colmajor_invalid_lda) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using row major layout and + * there is no transposition, lda_dst must be at least n. + */ +CTEST(cimatcopy, xerbla_rowmajor_notrans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using row major layout and + * there is transposition, lda_dst must be at least m. + */ +CTEST(cimatcopy, xerbla_rowmajor_trans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using column major layout and + * there is no transposition, lda_dst must be at least m. + */ +CTEST(cimatcopy, xerbla_colmajor_notrans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using column major layout and + * there is transposition, lda_dst must be at least n. + */ +CTEST(cimatcopy, xerbla_colmajor_trans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_comatcopy.c b/utest/test_extensions/test_comatcopy.c new file mode 100644 index 000000000..71663406a --- /dev/null +++ b/utest/test_extensions/test_comatcopy.c @@ -0,0 +1,696 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_COMATCOPY { + float a_test[DATASIZE * DATASIZE * 2]; + float b_test[DATASIZE * DATASIZE * 2]; + float b_verify[DATASIZE * DATASIZE * 2]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_COMATCOPY data_comatcopy; + +/** + * Comapare results computed by comatcopy and reference func + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows - number of rows of A + * param cols - number of columns of A + * param alpha - scaling factor for matrix B + * param lda - leading dimension of the matrix A + * param ldb - leading dimension of the matrix B + * return norm of difference between openblas and reference func + */ +static float check_comatcopy(char api, char order, char trans, blasint rows, blasint cols, float* alpha, + blasint lda, blasint ldb) +{ + blasint b_rows, b_cols; + blasint m, n; + enum CBLAS_ORDER corder; + enum CBLAS_TRANSPOSE ctrans; + int conj = -1; + + if (order == 'C') { + m = cols; n = rows; + } + else { + m = rows; n = cols; + } + + if(trans == 'T' || trans == 'C') { + b_rows = n; b_cols = m*2; + if (trans == 'C') + conj = 1; + } + else { + b_rows = m; b_cols = n*2; + if (trans == 'R') + conj = 1; + } + + srand_generate(data_comatcopy.a_test, lda*m*2); + + if (trans == 'T' || trans == 'C') { + ctranspose(m, n, alpha, data_comatcopy.a_test, lda, data_comatcopy.b_verify, ldb, conj); + } + else { + ccopy(m, n, alpha, data_comatcopy.a_test, lda, data_comatcopy.b_verify, ldb, conj); + } + + if (api == 'F') { + BLASFUNC(comatcopy)(&order, &trans, &rows, &cols, alpha, data_comatcopy.a_test, + &lda, data_comatcopy.b_test, &ldb); + } + else { + if (order == 'C') corder = CblasColMajor; + if (order == 'R') corder = CblasRowMajor; + if (trans == 'T') ctrans = CblasTrans; + if (trans == 'N') ctrans = CblasNoTrans; + if (trans == 'C') ctrans = CblasConjTrans; + if (trans == 'R') ctrans = CblasConjNoTrans; + cblas_comatcopy(corder, ctrans, rows, cols, alpha, data_comatcopy.a_test, + lda, data_comatcopy.b_test, ldb); + } + + return smatrix_difference(data_comatcopy.b_test, data_comatcopy.b_verify, b_cols, b_rows, ldb*2); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows - number of rows of A + * param cols - number of columns of A + * param lda - leading dimension of the matrix A + * param ldb - leading dimension of the matrix B + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char order, char trans, blasint rows, blasint cols, + blasint lda, blasint ldb, int expected_info) +{ + float alpha[] = {1.0f, 1.0f}; + + set_xerbla("COMATCOPY", expected_info); + + BLASFUNC(comatcopy)(&order, &trans, &rows, &cols, alpha, data_comatcopy.a_test, + &lda, data_comatcopy.b_test, &ldb); + + return check_error(); +} + +/** + * Fortran API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(comatcopy, colmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'N'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * alpha_r = -1.0, alpha_i = 2.0 + */ +CTEST(comatcopy, colmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'T'; + float alpha[] = {-1.0f, 2.0f}; + + float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(comatcopy, colmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'R'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition and conjugate + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(comatcopy, colmajor_conjtrnas_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'C'; + float alpha[] = {2.0f, 1.0f}; + + float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(comatcopy, rowmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 50, ldb = 50; + char order = 'R'; + char trans = 'N'; + float alpha[] = {1.5f, -1.0f}; + + float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(comatcopy, rowmajor_trans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'T'; + float alpha[] = {1.5f, -1.0f}; + + float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy and conjugate + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(comatcopy, rowmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'R'; + float alpha[] = {1.5f, -1.0f}; + + float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(comatcopy, rowmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'C'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(comatcopy, c_api_colmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'N'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * alpha_r = -1.0, alpha_i = 2.0 + */ +CTEST(comatcopy, c_api_colmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'T'; + float alpha[] = {-1.0f, 2.0f}; + + float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(comatcopy, c_api_colmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'R'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition and conjugate + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(comatcopy, c_api_colmajor_conjtrnas_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'C'; + float alpha[] = {2.0f, 1.0f}; + + float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(comatcopy, c_api_rowmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 50, ldb = 50; + char order = 'R'; + char trans = 'N'; + float alpha[] = {1.5f, -1.0f}; + + float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(comatcopy, c_api_rowmajor_trans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'T'; + float alpha[] = {1.5f, -1.0f}; + + float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy and conjugate + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(comatcopy, c_api_rowmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'R'; + float alpha[] = {1.5f, -1.0f}; + + float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(comatcopy, c_api_rowmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'C'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test error function for an invalid param order. + * Must be column (C) or row major (R). + */ +CTEST(comatcopy, xerbla_invalid_order) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'O'; + char trans = 'T'; + int expected_info = 1; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param trans. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(comatcopy, xerbla_invalid_trans) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'O'; + int expected_info = 2; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda. + * If matrices are stored using row major layout, + * lda must be at least n. + */ +CTEST(comatcopy, xerbla_rowmajor_invalid_lda) +{ + blasint m = 50, n = 100; + blasint lda = 50, ldb = 100; + char order = 'R'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda. + * If matrices are stored using column major layout, + * lda must be at least m. + */ +CTEST(comatcopy, xerbla_colmajor_invalid_lda) +{ + blasint m = 100, n = 50; + blasint lda = 50, ldb = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using row major layout and + * there is no transposition, ldb must be at least n. + */ +CTEST(comatcopy, xerbla_rowmajor_notrans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using row major layout and + * there is transposition, ldb must be at least m. + */ +CTEST(comatcopy, xerbla_rowmajor_trans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using row major layout and + * there is no transposition, ldb must be at least n. + */ +CTEST(comatcopy, xerbla_rowmajor_conj_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'R'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using row major layout and + * there is transposition, ldb must be at least m. + */ +CTEST(comatcopy, xerbla_rowmajor_transconj_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'C'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using column major layout and + * there is no transposition, ldb must be at least m. + */ +CTEST(comatcopy, xerbla_colmajor_notrans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using column major layout and + * there is transposition, ldb must be at least n. + */ +CTEST(comatcopy, xerbla_colmajor_trans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using column major layout and + * there is no transposition, ldb must be at least m. + */ +CTEST(comatcopy, xerbla_colmajor_conj_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'R'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using column major layout and + * there is transposition, ldb must be at least n. + */ +CTEST(comatcopy, xerbla_colmajor_transconj_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'C'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif diff --git a/utest/test_extensions/test_crot.c b/utest/test_extensions/test_crot.c new file mode 100644 index 000000000..1c55216d9 --- /dev/null +++ b/utest/test_extensions/test_crot.c @@ -0,0 +1,792 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_CROT { + float x_test[DATASIZE * INCREMENT * 2]; + float y_test[DATASIZE * INCREMENT * 2]; + float x_verify[DATASIZE * INCREMENT * 2]; + float y_verify[DATASIZE * INCREMENT * 2]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CROT data_crot; + +/** + * Fortran API specific function + * Comapare results computed by csrot and caxpby + * + * param n specifies size of vector x + * param inc_x specifies increment of vector x + * param inc_y specifies increment of vector y + * param c specifies cosine + * param s specifies sine + * return norm of differences + */ +static float check_csrot(blasint n, blasint inc_x, blasint inc_y, float *c, float *s) +{ + blasint i; + float norm = 0; + float s_neg[] = {-s[0], s[1]}; + + blasint inc_x_abs = labs(inc_x); + blasint inc_y_abs = labs(inc_y); + + // Fill vectors x, y + srand_generate(data_crot.x_test, n * inc_x_abs * 2); + srand_generate(data_crot.y_test, n * inc_y_abs * 2); + + if (inc_x == 0 && inc_y == 0) { + srand_generate(data_crot.x_test, n * 2); + srand_generate(data_crot.y_test, n * 2); + } + + // Copy vector x for caxpby + for (i = 0; i < n * inc_x_abs * 2; i++) + data_crot.x_verify[i] = data_crot.x_test[i]; + + // Copy vector y for caxpby + for (i = 0; i < n * inc_y_abs * 2; i++) + data_crot.y_verify[i] = data_crot.y_test[i]; + + // Find cx = c*x + s*y + BLASFUNC(caxpby)(&n, s, data_crot.y_test, &inc_y, c, data_crot.x_verify, &inc_x); + + // Find cy = -conjg(s)*x + c*y + BLASFUNC(caxpby)(&n, s_neg, data_crot.x_test, &inc_x, c, data_crot.y_verify, &inc_y); + + BLASFUNC(csrot)(&n, data_crot.x_test, &inc_x, data_crot.y_test, &inc_y, c, s); + + // Find the differences between vector x caculated by caxpby and csrot + for (i = 0; i < n * 2 * inc_x_abs; i++) + data_crot.x_test[i] -= data_crot.x_verify[i]; + + // Find the differences between vector y caculated by caxpby and csrot + for (i = 0; i < n * 2 * inc_y_abs; i++) + data_crot.y_test[i] -= data_crot.y_verify[i]; + + // Find the norm of differences + norm += BLASFUNC(scnrm2)(&n, data_crot.x_test, &inc_x_abs); + norm += BLASFUNC(scnrm2)(&n, data_crot.y_test, &inc_y_abs); + return (norm / 2); +} + +/** + * C API specific function + * Comapare results computed by csrot and caxpby + * + * param n specifies size of vector x + * param inc_x specifies increment of vector x + * param inc_y specifies increment of vector y + * param c specifies cosine + * param s specifies sine + * return norm of differences + */ +static float c_api_check_csrot(blasint n, blasint inc_x, blasint inc_y, float *c, float *s) +{ + blasint i; + float norm = 0; + float s_neg[] = {-s[0], s[1]}; + + blasint inc_x_abs = labs(inc_x); + blasint inc_y_abs = labs(inc_y); + + // Fill vectors x, y + srand_generate(data_crot.x_test, n * inc_x_abs * 2); + srand_generate(data_crot.y_test, n * inc_y_abs * 2); + + if (inc_x == 0 && inc_y == 0) { + srand_generate(data_crot.x_test, n * 2); + srand_generate(data_crot.y_test, n * 2); + } + + // Copy vector x for caxpby + for (i = 0; i < n * inc_x_abs * 2; i++) + data_crot.x_verify[i] = data_crot.x_test[i]; + + // Copy vector y for caxpby + for (i = 0; i < n * inc_y_abs * 2; i++) + data_crot.y_verify[i] = data_crot.y_test[i]; + + // Find cx = c*x + s*y + cblas_caxpby(n, s, data_crot.y_test, inc_y, c, data_crot.x_verify, inc_x); + + // Find cy = -conjg(s)*x + c*y + cblas_caxpby(n, s_neg, data_crot.x_test, inc_x, c, data_crot.y_verify, inc_y); + + cblas_csrot(n, data_crot.x_test, inc_x, data_crot.y_test, inc_y, c[0], s[0]); + + // Find the differences between vector x caculated by caxpby and csrot + for (i = 0; i < n * 2 * inc_x_abs; i++) + data_crot.x_test[i] -= data_crot.x_verify[i]; + + // Find the differences between vector y caculated by caxpby and csrot + for (i = 0; i < n * 2 * inc_y_abs; i++) + data_crot.y_test[i] -= data_crot.y_verify[i]; + + // Find the norm of differences + norm += cblas_scnrm2(n, data_crot.x_test, inc_x_abs); + norm += cblas_scnrm2(n, data_crot.y_test, inc_y_abs); + return (norm / 2); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 0 + * Stride of vector y is 0 + * c = 1.0f + * s = 2.0f + */ +CTEST(crot, inc_x_0_inc_y_0) +{ + blasint n = 100; + + blasint inc_x = 0; + blasint inc_y = 0; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {2.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, inc_x_1_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = 1; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is -1 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, inc_x_neg_1_inc_y_neg_1) +{ + blasint n = 100; + + blasint inc_x = -1; + blasint inc_y = -1; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + * c = 3.0f + * s = 2.0f + */ +CTEST(crot, inc_x_2_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 1; + + // Imaginary part for caxpby + float c[] = {3.0f, 0.0f}; + float s[] = {2.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is 1 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, inc_x_neg_2_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = -2; + blasint inc_y = 1; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, inc_x_1_inc_y_2) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = 2; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is -2 + * c = 2.0f + * s = 1.0f + */ +CTEST(crot, inc_x_1_inc_y_neg_2) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = -2; + + // Imaginary part for caxpby + float c[] = {2.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0f + * s = 2.0f + */ +CTEST(crot, inc_x_2_inc_y_2) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {2.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, inc_x_neg_2_inc_y_neg_2) +{ + blasint n = 100; + + blasint inc_x = -2; + blasint inc_y = -2; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 0.0f + * s = 1.0f + */ +CTEST(crot, inc_x_2_inc_y_2_c_zero) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for caxpby + float c[] = {0.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0f + * s = 0.0f + */ +CTEST(crot, inc_x_2_inc_y_2_s_zero) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {0.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 0 + * Stride of vector x is 1 + * Stride of vector y is 1 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, check_n_zero) +{ + blasint n = 0; + + blasint inc_x = 1; + blasint inc_y = 1; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 0 + * Stride of vector y is 0 + * c = 1.0f + * s = 2.0f + */ +CTEST(crot, c_api_inc_x_0_inc_y_0) +{ + blasint n = 100; + + blasint inc_x = 0; + blasint inc_y = 0; + + // Imaginary part for caxpby + float c[] = {3.0f, 0.0f}; + float s[] = {2.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, c_api_inc_x_1_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = 1; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is -1 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, c_api_inc_x_neg_1_inc_y_neg_1) +{ + blasint n = 100; + + blasint inc_x = -1; + blasint inc_y = -1; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + * c = 3.0f + * s = 2.0f + */ +CTEST(crot, c_api_inc_x_2_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 1; + + // Imaginary part for caxpby + float c[] = {3.0f, 0.0f}; + float s[] = {2.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is 1 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, c_api_inc_x_neg_2_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = -2; + blasint inc_y = 1; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, c_api_inc_x_1_inc_y_2) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = 2; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is -2 + * c = 2.0f + * s = 1.0f + */ +CTEST(crot, c_api_inc_x_1_inc_y_neg_2) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = -2; + + // Imaginary part for caxpby + float c[] = {2.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0f + * s = 2.0f + */ +CTEST(crot, c_api_inc_x_2_inc_y_2) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {2.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, c_api_inc_x_neg_2_inc_y_neg_2) +{ + blasint n = 100; + + blasint inc_x = -2; + blasint inc_y = -2; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 0.0f + * s = 1.0f + */ +CTEST(crot, c_api_inc_x_2_inc_y_2_c_zero) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for caxpby + float c[] = {0.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0f + * s = 0.0f + */ +CTEST(crot, c_api_inc_x_2_inc_y_2_s_zero) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {0.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 0 + * Stride of vector x is 1 + * Stride of vector y is 1 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, c_api_check_n_zero) +{ + blasint n = 0; + + blasint inc_x = 1; + blasint inc_y = 1; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_crotg.c b/utest/test_extensions/test_crotg.c new file mode 100644 index 000000000..84875ccf7 --- /dev/null +++ b/utest/test_extensions/test_crotg.c @@ -0,0 +1,290 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#ifdef BUILD_COMPLEX + +/** + * Fortran API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, zero_a) +{ + float sa[2] = {0.0f, 0.0f}; + float sb[2] = {1.0f, 1.0f}; + float ss[2]; + float sc; + BLASFUNC(crotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.0f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.70711f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.70711f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.41421f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, sa[1], SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, zero_b) +{ + float sa[2] = {1.0f, 1.0f}; + float sb[2] = {0.0f, 0.0f}; + float ss[2]; + float sc; + BLASFUNC(crotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(1.0f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0f, sa[1], SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, zero_real) +{ + float sa[2] = {0.0f, 1.0f}; + float sb[2] = {0.0f, 1.0f}; + float ss[2]; + float sc; + BLASFUNC(crotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.70711f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.70711f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.41421f, sa[1], SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, positive_real_positive_img) +{ + float sa[2] = {3.0f, 4.0f}; + float sb[2] = {4.0f, 6.0f}; + float ss[2]; + float sc; + BLASFUNC(crotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.04558f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(5.26498f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(7.01997f, sa[1], SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, negative_real_positive_img) +{ + float sa[2] = {-3.0f, 4.0f}; + float sb[2] = {-4.0f, 6.0f}; + float ss[2]; + float sc; + BLASFUNC(crotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.04558f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-5.26498f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(7.01997f, sa[1], SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, positive_real_negative_img) +{ + float sa[2] = {3.0f, -4.0f}; + float sb[2] = {4.0f, -6.0f}; + float ss[2]; + float sc; + BLASFUNC(crotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.04558f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(5.26498f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-7.01997f, sa[1], SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, negative_real_negative_img) +{ + float sa[2] = {-3.0f, -4.0f}; + float sb[2] = {-4.0f, -6.0f}; + float ss[2]; + float sc; + BLASFUNC(crotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.04558f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-5.26498f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-7.01997f, sa[1], SINGLE_EPS); +} + +/** + * C API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, c_api_zero_a) +{ + float sa[2] = {0.0f, 0.0f}; + float sb[2] = {1.0f, 1.0f}; + float ss[2]; + float sc; + cblas_crotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.0f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.70711f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.70711f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.41421f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, sa[1], SINGLE_EPS); +} + +/** + * C API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, c_api_zero_b) +{ + float sa[2] = {1.0f, 1.0f}; + float sb[2] = {0.0f, 0.0f}; + float ss[2]; + float sc; + cblas_crotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(1.0f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0f, sa[1], SINGLE_EPS); +} + +/** + * C API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, c_api_zero_real) +{ + float sa[2] = {0.0f, 1.0f}; + float sb[2] = {0.0f, 1.0f}; + float ss[2]; + float sc; + cblas_crotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.70711f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.70711f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.41421f, sa[1], SINGLE_EPS); +} + +/** + * C API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, c_api_positive_real_positive_img) +{ + float sa[2] = {3.0f, 4.0f}; + float sb[2] = {4.0f, 6.0f}; + float ss[2]; + float sc; + cblas_crotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.04558f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(5.26498f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(7.01997f, sa[1], SINGLE_EPS); +} + +/** + * C API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, c_api_negative_real_positive_img) +{ + float sa[2] = {-3.0f, 4.0f}; + float sb[2] = {-4.0f, 6.0f}; + float ss[2]; + float sc; + cblas_crotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.04558f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-5.26498f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(7.01997f, sa[1], SINGLE_EPS); +} + +/** + * C API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, c_api_positive_real_negative_img) +{ + float sa[2] = {3.0f, -4.0f}; + float sb[2] = {4.0f, -6.0f}; + float ss[2]; + float sc; + cblas_crotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.04558f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(5.26498f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-7.01997f, sa[1], SINGLE_EPS); +} + +/** + * C API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, c_api_negative_real_negative_img) +{ + float sa[2] = {-3.0f, -4.0f}; + float sb[2] = {-4.0f, -6.0f}; + float ss[2]; + float sc; + cblas_crotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.04558f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-5.26498f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-7.01997f, sa[1], SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_csbmv.c b/utest/test_extensions/test_csbmv.c new file mode 100644 index 000000000..41c24a2b7 --- /dev/null +++ b/utest/test_extensions/test_csbmv.c @@ -0,0 +1,606 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_CSBMV { + float sp_matrix[DATASIZE * (DATASIZE + 1)]; + float sb_matrix[DATASIZE * DATASIZE * 2]; + float b_test[DATASIZE * 2 * INCREMENT]; + float c_test[DATASIZE * 2 * INCREMENT]; + float c_verify[DATASIZE * 2 * INCREMENT]; +}; + +// SINGLE_EPS_ZGEMV = MAX_VAL * NUMBER OF OPERATIONS * FLT_EPSILON +// SINGLE_EPS_ZGEMV = 5.0 * O(100 * 100) * 1.19e-07 = 5*e-03 +#define SINGLE_EPS_ZGEMV 5e-03 + +#ifdef BUILD_COMPLEX +static struct DATA_CSBMV data_csbmv; + +/** + * Transform full-storage symmetric band matrix A to upper (U) or lower (L) + * band-packed storage mode. + * + * param uplo specifies whether matrix a is upper or lower band-packed. + * param n - number of rows and columns of A + * param k - number of super-diagonals of A + * output param a - buffer for holding symmetric band-packed matrix + * param lda - specifies the leading dimension of a + * param sb_matrix - buffer holding full-storage symmetric band matrix A + * param ldm - specifies the leading dimension of A + */ +static void transform_to_band_storage(char uplo, blasint n, blasint k, float* a, blasint lda, + float* sb_matrix, blasint ldm) +{ + blasint i, j, m; + if (uplo == 'L') { + for (j = 0; j < n; j++) + { + m = -j; + for (i = 2 * j; i < MIN(2 * n, 2 * (j + k + 1)); i += 2) + { + a[(2*m + i) + j * lda * 2] = sb_matrix[i + j * ldm * 2]; + a[(2*m + (i + 1)) + j * lda * 2] = sb_matrix[(i + 1) + j * ldm * 2]; + } + } + } + else { + for (j = 0; j < n; j++) + { + m = k - j; + for (i = MAX(0, 2*(j - k)); i <= j*2; i += 2) + { + a[(2*m + i) + j * lda * 2] = sb_matrix[i + j * ldm * 2]; + a[(2*m + (i + 1)) + j * lda * 2] = sb_matrix[(i + 1) + j * ldm * 2]; + } + } + } +} + +/** + * Generate full-storage symmetric band matrix A with k - super-diagonals + * from input symmetric packed matrix in lower packed mode (L) + * + * output param sb_matrix - buffer for holding full-storage symmetric band matrix. + * param sp_matrix - buffer holding input symmetric packed matrix + * param n - number of rows and columns of A + * param k - number of super-diagonals of A + */ +static void get_symmetric_band_matr(float *sb_matrix, float *sp_matrix, blasint n, blasint k) +{ + blasint m; + blasint i, j; + m = 0; + for (i = 0; i < n; i++) + { + for (j = 0; j < n * 2; j += 2) + { + // Make matrix band with k super-diagonals + if (fabs((i+1) - ceil((j+1)/2.0f)) > k) + { + sb_matrix[i * n * 2 + j] = 0.0f; + sb_matrix[i * n * 2 + j + 1] = 0.0f; + continue; + } + + if (j / 2 < i) + { + sb_matrix[i * n * 2 + j] = + sb_matrix[j * n + i * 2]; + sb_matrix[i * n * 2 + j + 1] = + sb_matrix[j * n + i * 2 + 1]; + } + else + { + sb_matrix[i * n * 2 + j] = sp_matrix[m++]; + sb_matrix[i * n * 2 + j + 1] = sp_matrix[m++]; + } + } + } +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param uplo specifies whether matrix a is upper or lower band-packed. + * param n - number of rows and columns of A + * param k - number of super-diagonals of A + * param lda - specifies the leading dimension of a + * param inc_b - stride of vector b_test + * param inc_c - stride of vector c_test + * param expected_info - expected invalid parameter number in csbmv + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char uplo, blasint n, blasint k, blasint lda, blasint inc_b, + blasint inc_c, int expected_info) +{ + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float a[2]; + srand_generate(a, 2); + + set_xerbla("CSBMV ", expected_info); + + BLASFUNC(csbmv)(&uplo, &n, &k, alpha, a, &lda, data_csbmv.b_test, + &inc_b, beta, data_csbmv.c_test, &inc_c); + + return check_error(); +} + +/** + * Comapare results computed by csbmv and cgemv + * since csbmv is cgemv for symmetric band matrix + * + * param uplo specifies whether matrix A is upper or lower triangular + * param n - number of rows and columns of A + * param k - number of super-diagonals of A + * param alpha - scaling factor for the matrix-vector product + * param lda - specifies the leading dimension of a + * param inc_b - stride of vector b_test + * param beta - scaling factor for vector c_test + * param inc_c - stride of vector c_test + * param lda - specifies the leading dimension of a + * return norm of differences + */ +static float check_csbmv(char uplo, blasint n, blasint k, float *alpha, blasint lda, + blasint inc_b, float *beta, blasint inc_c, blasint ldm) +{ + blasint i; + + // Trans param for gemv (can use any, since the input matrix is symmetric) + char trans = 'N'; + + // Symmetric band packed matrix for sbmv + float *a = (float*) malloc(lda * n * 2 * sizeof(float)); + + // Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test + srand_generate(data_csbmv.sp_matrix, n * (n + 1)); + srand_generate(data_csbmv.b_test, n * inc_b * 2); + srand_generate(data_csbmv.c_test, n * inc_c * 2); + + // Copy vector c_test for cgemv + for (i = 0; i < n * inc_c * 2; i++) + data_csbmv.c_verify[i] = data_csbmv.c_test[i]; + + // Generate full-storage symmetric band matrix + // with k super-diagonals from symmetric packed matrix + get_symmetric_band_matr(data_csbmv.sb_matrix, data_csbmv.sp_matrix, n, k); + + // Transform symmetric band matrix from conventional + // full matrix storage to band storage for csbmv + transform_to_band_storage(uplo, n, k, a, lda, data_csbmv.sb_matrix, ldm); + + BLASFUNC(cgemv)(&trans, &n, &n, alpha, data_csbmv.sb_matrix, &ldm, data_csbmv.b_test, + &inc_b, beta, data_csbmv.c_verify, &inc_c); + + BLASFUNC(csbmv)(&uplo, &n, &k, alpha, a, &lda, + data_csbmv.b_test, &inc_b, beta, data_csbmv.c_test, &inc_c); + + // Find the differences between output vector caculated by csbmv and cgemv + for (i = 0; i < n * inc_c * 2; i++) + data_csbmv.c_test[i] -= data_csbmv.c_verify[i]; + free(a); + // Find the norm of differences + return BLASFUNC(scnrm2)(&n, data_csbmv.c_test, &inc_c); +} + +/** + * Test csbmv by comparing it against cgemv + * with the following options: + * + * a is upper-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 0 + */ +CTEST(csbmv, upper_k_0_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 0; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Test csbmv by comparing it against cgemv + * with the following options: + * + * a is upper-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 1 + */ +CTEST(csbmv, upper_k_1_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 1; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Test csbmv by comparing it against cgemv + * with the following options: + * + * a is upper-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 2 + */ +CTEST(csbmv, upper_k_2_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Test csbmv by comparing it against cgemv + * with the following options: + * + * a is upper-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 2 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 2 + */ +CTEST(csbmv, upper_k_2_inc_b_2_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 2, inc_c = 1; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + float alpha[] = {2.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Test csbmv by comparing it against cgemv + * with the following options: + * + * a is upper-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 2 + * Stride of vector c_test is 2 + * Number of super-diagonals k is 2 + */ +CTEST(csbmv, upper_k_2_inc_b_2_inc_c_2_n_100) +{ + blasint n = DATASIZE, inc_b = 2, inc_c = 2; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + float alpha[] = {2.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Test csbmv by comparing it against cgemv + * with the following options: + * + * a is lower-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 0 + */ +CTEST(csbmv, lower_k_0_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 0; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'L'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Test csbmv by comparing it against cgemv + * with the following options: + * + * a is lower-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 1 + */ +CTEST(csbmv, lower_k_1_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 1; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'L'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Test csbmv by comparing it against cgemv + * with the following options: + * + * a is lower-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 2 + */ +CTEST(csbmv, lower_k_2_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'L'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Test csbmv by comparing it against cgemv + * with the following options: + * + * a is lower-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 2 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 2 + */ +CTEST(csbmv, lower_k_2_inc_b_2_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 2, inc_c = 1; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'L'; + + float alpha[] = {2.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Test csbmv by comparing it against cgemv + * with the following options: + * + * a is lower-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 2 + * Stride of vector c_test is 2 + * Number of super-diagonals k is 2 + */ +CTEST(csbmv, lower_k_2_inc_b_2_inc_c_2_n_100) +{ + blasint n = DATASIZE, inc_b = 2, inc_c = 2; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'L'; + + float alpha[] = {2.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Check if output matrix a contains any NaNs + */ +CTEST(csbmv, check_for_NaN) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 0; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + + ASSERT_TRUE(norm == norm); /* NaN == NaN is false */ +} + +/** + * Test error function for an invalid param uplo. + * Uplo specifies whether a is in upper (U) or lower (L) band-packed storage mode. + */ +CTEST(csbmv, xerbla_uplo_invalid) +{ + blasint n = 1, inc_b = 1, inc_c = 1; + char uplo = 'O'; + blasint k = 0; + blasint lda = k + 1; + int expected_info = 1; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param N - + * number of rows and columns of A. Must be at least zero. + */ +CTEST(csbmv, xerbla_n_invalid) +{ + blasint n = INVALID, inc_b = 1, inc_c = 1; + char uplo = 'U'; + blasint k = 0; + blasint lda = k + 1; + int expected_info = 2; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Check if n - number of rows and columns of A equal zero. + */ +CTEST(csbmv, check_n_zero) +{ + blasint n = 0, inc_b = 1, inc_c = 1; + blasint k = 0; + blasint lda = k + 1; + blasint ldm = 1; + char uplo = 'U'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Test error function for an invalid param inc_b - + * stride of vector b_test. Can't be zero. + */ +CTEST(csbmv, xerbla_inc_b_zero) +{ + blasint n = 1, inc_b = 0, inc_c = 1; + char uplo = 'U'; + blasint k = 0; + blasint lda = k + 1; + int expected_info = 8; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param inc_c - + * stride of vector c_test. Can't be zero. + */ +CTEST(csbmv, xerbla_inc_c_zero) +{ + blasint n = 1, inc_b = 1, inc_c = 0; + char uplo = 'U'; + blasint k = 0; + blasint lda = k + 1; + int expected_info = 11; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param k - + * number of super-diagonals of A. Must be at least zero. + */ +CTEST(csbmv, xerbla_k_invalid) +{ + blasint n = 1, inc_b = 1, inc_c = 1; + char uplo = 'U'; + blasint k = INVALID; + blasint lda = 1; + int expected_info = 3; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda - + * specifies the leading dimension of a. Must be at least (k+1). + */ +CTEST(csbmv, xerbla_lda_invalid) +{ + blasint n = 1, inc_b = 1, inc_c = 1; + char uplo = 'U'; + blasint k = 0; + blasint lda = INVALID; + int expected_info = 6; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif diff --git a/utest/test_extensions/test_cscal.c b/utest/test_extensions/test_cscal.c new file mode 100644 index 000000000..009c600ad --- /dev/null +++ b/utest/test_extensions/test_cscal.c @@ -0,0 +1,164 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_CSCAL { + float x_test[DATASIZE * 2 * INCREMENT]; + float x_verify[DATASIZE * 2 * INCREMENT]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CSCAL data_cscal; + +/** + * cscal reference code + * + * param n - number of elements of vector x + * param alpha - scaling factor for the vector product + * param x - buffer holding input vector x + * param inc - stride of vector x + */ +static void cscal_trusted(blasint n, float *alpha, float* x, blasint inc){ + blasint i, ip = 0; + blasint inc_x2 = 2 * inc; + float temp; + for (i = 0; i < n; i++) + { + temp = alpha[0] * x[ip] - alpha[1] * x[ip+1]; + x[ip+1] = alpha[0] * x[ip+1] + alpha[1] * x[ip]; + x[ip] = temp; + ip += inc_x2; + } +} + +/** + * Comapare results computed by cscal and cscal_trusted + * + * param api specifies tested api (C or Fortran) + * param n - number of elements of vector x + * param alpha - scaling factor for the vector product + * param inc - stride of vector x + * return norm of differences + */ +static float check_cscal(char api, blasint n, float *alpha, blasint inc) +{ + blasint i; + + // Fill vectors a + srand_generate(data_cscal.x_test, n * inc * 2); + + // Copy vector x for cscal_trusted + for (i = 0; i < n * 2 * inc; i++) + data_cscal.x_verify[i] = data_cscal.x_test[i]; + + cscal_trusted(n, alpha, data_cscal.x_verify, inc); + + if(api == 'F') + BLASFUNC(cscal)(&n, alpha, data_cscal.x_test, &inc); + else + cblas_cscal(n, alpha, data_cscal.x_test, inc); + + // Find the differences between output vector computed by cscal and cscal_trusted + for (i = 0; i < n * 2 * inc; i++) + data_cscal.x_verify[i] -= data_cscal.x_test[i]; + + // Find the norm of differences + return BLASFUNC(scnrm2)(&n, data_cscal.x_verify, &inc); +} + +/** + * Fortran API specific test + * Test cscal by comparing it against reference + */ +CTEST(cscal, alpha_r_zero_alpha_i_not_zero) +{ + blasint N = DATASIZE; + blasint inc = 1; + float alpha[2] = {0.0f, 1.0f}; + + float norm = check_cscal('F', N, alpha, inc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cscal by comparing it against reference + */ +CTEST(cscal, alpha_r_zero_alpha_i_zero_inc_2) +{ + blasint N = DATASIZE; + blasint inc = 2; + float alpha[2] = {0.0f, 0.0f}; + + float norm = check_cscal('F', N, alpha, inc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cscal by comparing it against reference + */ +CTEST(cscal, c_api_alpha_r_zero_alpha_i_not_zero) +{ + blasint N = DATASIZE; + blasint inc = 1; + float alpha[2] = {0.0f, 1.0f}; + + float norm = check_cscal('C', N, alpha, inc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cscal by comparing it against reference + */ +CTEST(cscal, c_api_alpha_r_zero_alpha_i_zero_inc_2) +{ + blasint N = DATASIZE; + blasint inc = 2; + float alpha[2] = {0.0f, 0.0f}; + + float norm = check_cscal('C', N, alpha, inc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_cspmv.c b/utest/test_extensions/test_cspmv.c new file mode 100644 index 000000000..b64c90e3a --- /dev/null +++ b/utest/test_extensions/test_cspmv.c @@ -0,0 +1,428 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_CSPMV { + float a_verify[DATASIZE * DATASIZE * 2]; + float a_test[DATASIZE * (DATASIZE + 1)]; + float b_test[DATASIZE * 2 * INCREMENT]; + float c_test[DATASIZE * 2 * INCREMENT]; + float c_verify[DATASIZE * 2 * INCREMENT]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CSPMV data_cspmv; + +/** + * Compute spmv via gemv since spmv is gemv for symmetric packed matrix + * + * param uplo specifies whether matrix A is upper or lower triangular + * param n - number of rows and columns of A + * param alpha - scaling factor for the matrix-vector product + * param a - buffer holding input matrix A + * param b - Buffer holding input vector b + * param inc_b - stride of vector b + * param beta - scaling factor for vector c + * param c - buffer holding input/output vector c + * param inc_c - stride of vector c + * output param data_cspmv.c_verify - matrix computed by gemv + */ +static void cspmv_trusted(char uplo, blasint n, float *alpha, float *a, + float *b, blasint inc_b, float *beta, float *c, + blasint inc_c) +{ + blasint k; + blasint i, j; + + // param for gemv (can use any, since the input matrix is symmetric) + char trans = 'N'; + + // Unpack the input symmetric packed matrix + if (uplo == 'L') + { + k = 0; + for (i = 0; i < n; i++) + { + for (j = 0; j < n * 2; j += 2) + { + if (j / 2 < i) + { + data_cspmv.a_verify[i * n * 2 + j] = + data_cspmv.a_verify[j * n + i * 2]; + data_cspmv.a_verify[i * n * 2 + j + 1] = + data_cspmv.a_verify[j * n + i * 2 + 1]; + } + else + { + data_cspmv.a_verify[i * n * 2 + j] = a[k++]; + data_cspmv.a_verify[i * n * 2 + j + 1] = a[k++]; + } + } + } + } + else + { + k = n * (n + 1) - 1; + for (j = 2 * n - 1; j >= 0; j -= 2) + { + for (i = n - 1; i >= 0; i--) + { + if (j / 2 < i) + { + data_cspmv.a_verify[i * n * 2 + j] = + data_cspmv.a_verify[(j - 1) * n + i * 2 + 1]; + data_cspmv.a_verify[i * n * 2 + j - 1] = + data_cspmv.a_verify[(j - 1) * n + i * 2]; + } + else + { + data_cspmv.a_verify[i * n * 2 + j] = a[k--]; + data_cspmv.a_verify[i * n * 2 + j - 1] = a[k--]; + } + } + } + } + + // Run gemv with the unpacked matrix + BLASFUNC(cgemv)(&trans, &n, &n, alpha, data_cspmv.a_verify, &n, b, + &inc_b, beta, data_cspmv.c_verify, &inc_c); +} + +/** + * Comapare results computed by cspmv and cspmv_trusted + * + * param uplo specifies whether matrix A is upper or lower triangular + * param n - number of rows and columns of A + * param alpha - scaling factor for the matrix-vector product + * param inc_b - stride of vector b + * param beta - scaling factor for vector c + * param inc_c - stride of vector c + * return norm of differences + */ +static float check_cspmv(char uplo, blasint n, float *alpha, blasint inc_b, + float *beta, blasint inc_c) +{ + blasint i; + + // Fill symmetric packed maxtix a, vectors b and c + srand_generate(data_cspmv.a_test, n * (n + 1)); + srand_generate(data_cspmv.b_test, 2 * n * inc_b); + srand_generate(data_cspmv.c_test, 2 * n * inc_c); + + // Copy vector c for cspmv_trusted + for (i = 0; i < n * 2 * inc_c; i++) + data_cspmv.c_verify[i] = data_cspmv.c_test[i]; + + cspmv_trusted(uplo, n, alpha, data_cspmv.a_test, data_cspmv.b_test, + inc_b, beta, data_cspmv.c_verify, inc_c); + + BLASFUNC(cspmv)(&uplo, &n, alpha, data_cspmv.a_test, data_cspmv.b_test, + &inc_b, beta, data_cspmv.c_test, &inc_c); + + // Find the differences between output vector computed by cspmv and cspmv_trusted + for (i = 0; i < n * 2 * inc_c; i++) + data_cspmv.c_test[i] -= data_cspmv.c_verify[i]; + + // Find the norm of differences + return BLASFUNC(scnrm2)(&n, data_cspmv.c_test, &inc_c); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param uplo specifies whether matrix A is upper or lower triangular + * param n - number of rows and columns of A + * param inc_b - stride of vector b + * param inc_c - stride of vector c + * param expected_info - expected invalid parameter number in cspmv + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char uplo, blasint n, blasint inc_b, + blasint inc_c, int expected_info) +{ + float alpha[] = {1.0, 1.0}; + float beta[] = {0.0, 0.0}; + + set_xerbla("CSPMV ", expected_info); + + BLASFUNC(cspmv)(&uplo, &n, alpha, data_cspmv.a_test, data_cspmv.b_test, + &inc_b, beta, data_cspmv.c_test, &inc_c); + + return check_error(); +} + +/** + * Test cspmv by comparing it against cgemv + * with the following options: + * + * A is upper triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(cspmv, upper_inc_b_1_inc_c_1_N_100) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 1; + char uplo = 'U'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cspmv by comparing it against cgemv + * with the following options: + * + * A is upper triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 2 + */ +CTEST(cspmv, upper_inc_b_1_inc_c_2_N_100) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 2; + char uplo = 'U'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cspmv by comparing it against cgemv + * with the following options: + * + * A is upper triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 1 + */ +CTEST(cspmv, upper_inc_b_2_inc_c_1_N_100) +{ + blasint N = DATASIZE, inc_b = 2, inc_c = 1; + char uplo = 'U'; + float alpha[] = {1.0f, 0.0f}; + float beta[] = {1.0f, 0.0f}; + + float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cspmv by comparing it against cgemv + * with the following options: + * + * A is upper triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 2 + */ +CTEST(cspmv, upper_inc_b_2_inc_c_2_N_100) +{ + blasint N = DATASIZE, inc_b = 2, inc_c = 2; + char uplo = 'U'; + float alpha[] = {2.5, -2.1}; + float beta[] = {0.0f, 1.0f}; + + float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cspmv by comparing it against cgemv + * with the following options: + * + * A is lower triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(cspmv, lower_inc_b_1_inc_c_1_N_100) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 1; + char uplo = 'L'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cspmv by comparing it against cgemv + * with the following options: + * + * A is lower triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 2 + */ +CTEST(cspmv, lower_inc_b_1_inc_c_2_N_100) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 2; + char uplo = 'L'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cspmv by comparing it against cgemv + * with the following options: + * + * A is lower triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 1 + */ +CTEST(cspmv, lower_inc_b_2_inc_c_1_N_100) +{ + blasint N = DATASIZE, inc_b = 2, inc_c = 1; + char uplo = 'L'; + float alpha[] = {1.0f, 0.0f}; + float beta[] = {1.0f, 0.0f}; + + float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cspmv by comparing it against cgemv + * with the following options: + * + * A is lower triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 2 + */ +CTEST(cspmv, lower_inc_b_2_inc_c_2_N_100) +{ + blasint N = DATASIZE, inc_b = 2, inc_c = 2; + char uplo = 'L'; + float alpha[] = {2.5, -2.1}; + float beta[] = {0.0f, 1.0f}; + + float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Check if output matrix A contains any NaNs + */ +CTEST(cspmv, check_for_NaN) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 1; + char uplo = 'U'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_TRUE(norm == norm); /* NaN == NaN is false */ +} + +/** + * Test error function for an invalid param uplo. + * uplo specifies whether A is upper or lower triangular. + */ +CTEST(cspmv, xerbla_uplo_invalid) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 1; + char uplo = 'O'; + int expected_info = 1; + + int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param N - + * number of rows and columns of A. Must be at least zero. + */ +CTEST(cspmv, xerbla_N_invalid) +{ + blasint N = INVALID, inc_b = 1, inc_c = 1; + char uplo = 'U'; + int expected_info = 2; + + int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param inc_b - + * stride of vector b. Can't be zero. + */ +CTEST(cspmv, xerbla_inc_b_zero) +{ + blasint N = DATASIZE, inc_b = 0, inc_c = 1; + char uplo = 'U'; + int expected_info = 6; + + int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param inc_c - + * stride of vector c. Can't be zero. + */ +CTEST(cspmv, xerbla_inc_c_zero) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 0; + char uplo = 'U'; + int expected_info = 9; + + int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif diff --git a/utest/test_extensions/test_ctrmv.c b/utest/test_extensions/test_ctrmv.c new file mode 100644 index 000000000..2a3f27416 --- /dev/null +++ b/utest/test_extensions/test_ctrmv.c @@ -0,0 +1,266 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include +#include "common.h" + +#define DATASIZE 300 +#define INCREMENT 2 + +struct DATA_CTRMV { + float a_test[DATASIZE * DATASIZE * 2]; + float a_verify[DATASIZE * DATASIZE * 2]; + float x_test[DATASIZE * INCREMENT * 2]; + float x_verify[DATASIZE * INCREMENT * 2]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CTRMV data_ctrmv; + +/** + * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrmv. + * + * param uplo specifies whether A is upper or lower triangular + * param trans specifies op(A), the transposition (conjugation) operation applied to A + * param diag specifies whether the matrix A is unit triangular or not. + * param n - numbers of rows and columns of A + * param lda - leading dimension of matrix A + * param incx - increment for the elements of x + * return norm of difference + */ +static float check_ctrmv(char uplo, char trans, char diag, blasint n, blasint lda, blasint incx) +{ + blasint i; + float alpha_conj[] = {1.0f, 0.0f}; + char trans_verify = trans; + + srand_generate(data_ctrmv.a_test, n * lda * 2); + srand_generate(data_ctrmv.x_test, n * incx * 2); + + for (i = 0; i < n * lda * 2; i++) + data_ctrmv.a_verify[i] = data_ctrmv.a_test[i]; + + for (i = 0; i < n * incx * 2; i++) + data_ctrmv.x_verify[i] = data_ctrmv.x_test[i]; + + if (trans == 'R'){ + cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, n, n, alpha_conj, data_ctrmv.a_verify, lda, lda); + trans_verify = 'N'; + } + + BLASFUNC(ctrmv)(&uplo, &trans_verify, &diag, &n, data_ctrmv.a_verify, &lda, + data_ctrmv.x_verify, &incx); + + BLASFUNC(ctrmv)(&uplo, &trans, &diag, &n, data_ctrmv.a_test, &lda, + data_ctrmv.x_test, &incx); + + for (i = 0; i < n * incx * 2; i++) + data_ctrmv.x_verify[i] -= data_ctrmv.x_test[i]; + + return BLASFUNC(scnrm2)(&n, data_ctrmv.x_verify, &incx); +} + +/** + * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is not unit triangular + */ +CTEST(ctrmv, conj_notrans_upper_not_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'U'; + char diag = 'N'; + char trans = 'R'; + + float norm = check_ctrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is unit triangular + */ +CTEST(ctrmv, conj_notrans_upper_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'U'; + char diag = 'U'; + char trans = 'R'; + + float norm = check_ctrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is not unit triangular + */ +CTEST(ctrmv, conj_notrans_lower_not_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'L'; + char diag = 'N'; + char trans = 'R'; + + float norm = check_ctrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is unit triangular + */ +CTEST(ctrmv, conj_notrans_lower_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'L'; + char diag = 'U'; + char trans = 'R'; + + float norm = check_ctrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is not unit triangular + * vector x stride is 2 + */ +CTEST(ctrmv, conj_notrans_upper_not_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'U'; + char diag = 'N'; + char trans = 'R'; + + float norm = check_ctrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is unit triangular + * vector x stride is 2 + */ +CTEST(ctrmv, conj_notrans_upper_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'U'; + char diag = 'U'; + char trans = 'R'; + + float norm = check_ctrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is not unit triangular + * vector x stride is 2 + */ +CTEST(ctrmv, conj_notrans_lower_not_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'L'; + char diag = 'N'; + char trans = 'R'; + + float norm = check_ctrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is unit triangular + * vector x stride is 2 + */ +CTEST(ctrmv, conj_notrans_lower_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'L'; + char diag = 'U'; + char trans = 'R'; + + float norm = check_ctrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_ctrsv.c b/utest/test_extensions/test_ctrsv.c new file mode 100644 index 000000000..0e639bb2a --- /dev/null +++ b/utest/test_extensions/test_ctrsv.c @@ -0,0 +1,267 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include +#include "common.h" + +#define DATASIZE 300 +#define INCREMENT 2 + +struct DATA_CTRSV { + float a_test[DATASIZE * DATASIZE * 2]; + float a_verify[DATASIZE * DATASIZE * 2]; + float x_test[DATASIZE * INCREMENT * 2]; + float x_verify[DATASIZE * INCREMENT * 2]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CTRSV data_ctrsv; + +/** + * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrsv. + * + * param uplo specifies whether A is upper or lower triangular + * param trans specifies op(A), the transposition (conjugation) operation applied to A + * param diag specifies whether the matrix A is unit triangular or not. + * param n - numbers of rows and columns of A + * param lda - leading dimension of matrix A + * param incx - increment for the elements of x + * return norm of difference + */ +static float check_ctrsv(char uplo, char trans, char diag, blasint n, blasint lda, blasint incx) +{ + blasint i; + float alpha_conj[] = {1.0f, 0.0f}; + char trans_verify = trans; + + srand_generate(data_ctrsv.a_test, n * lda * 2); + srand_generate(data_ctrsv.x_test, n * incx * 2); + + for (i = 0; i < n * lda * 2; i++) + data_ctrsv.a_verify[i] = data_ctrsv.a_test[i]; + + for (i = 0; i < n * incx * 2; i++) + data_ctrsv.x_verify[i] = data_ctrsv.x_test[i]; + + if (trans == 'R'){ + cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, n, n, + alpha_conj, data_ctrsv.a_verify, lda, lda); + trans_verify = 'N'; + } + + BLASFUNC(ctrsv)(&uplo, &trans_verify, &diag, &n, data_ctrsv.a_verify, + &lda, data_ctrsv.x_verify, &incx); + + BLASFUNC(ctrsv)(&uplo, &trans, &diag, &n, data_ctrsv.a_test, &lda, + data_ctrsv.x_test, &incx); + + for (i = 0; i < n * incx * 2; i++) + data_ctrsv.x_verify[i] -= data_ctrsv.x_test[i]; + + return BLASFUNC(scnrm2)(&n, data_ctrsv.x_verify, &incx); +} + +/** + * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is not unit triangular + */ +CTEST(ctrsv, conj_notrans_upper_not_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'U'; + char diag = 'N'; + char trans = 'R'; + + float norm = check_ctrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS); +} + +/** + * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is unit triangular + */ +CTEST(ctrsv, conj_notrans_upper_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'U'; + char diag = 'U'; + char trans = 'R'; + + float norm = check_ctrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS); +} + +/** + * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is not unit triangular + */ +CTEST(ctrsv, conj_notrans_lower_not_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'L'; + char diag = 'N'; + char trans = 'R'; + + float norm = check_ctrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS); +} + +/** + * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is unit triangular + */ +CTEST(ctrsv, conj_notrans_lower_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'L'; + char diag = 'U'; + char trans = 'R'; + + float norm = check_ctrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS); +} + +/** + * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is not unit triangular + * vector x stride is 2 + */ +CTEST(ctrsv, conj_notrans_upper_not_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'U'; + char diag = 'N'; + char trans = 'R'; + + float norm = check_ctrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS); +} + +/** + * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is unit triangular + * vector x stride is 2 + */ +CTEST(ctrsv, conj_notrans_upper_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'U'; + char diag = 'U'; + char trans = 'R'; + + float norm = check_ctrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS); +} + +/** + * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is not unit triangular + * vector x stride is 2 + */ +CTEST(ctrsv, conj_notrans_lower_not_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'L'; + char diag = 'N'; + char trans = 'R'; + + float norm = check_ctrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS); +} + +/** + * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is unit triangular + * vector x stride is 2 + */ +CTEST(ctrsv, conj_notrans_lower_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'L'; + char diag = 'U'; + char trans = 'R'; + + float norm = check_ctrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_damin.c b/utest/test_extensions/test_damin.c new file mode 100644 index 000000000..736921fa3 --- /dev/null +++ b/utest/test_extensions/test_damin.c @@ -0,0 +1,354 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 70 +#define INCREMENT 2 + +#ifdef BUILD_DOUBLE + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < ELEMENTS * inc; i ++) { + x[i] = 1000 - i; + } + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + double x[ELEMENTS]; + for (i = 0; i < N; i ++) { + x[i] = i - 1000; + } + x[8] = 0.0; + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(fabs(x[0]), amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.1}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {-1.1}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.1, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {-1.1, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.1, 1.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {-1.1, 1.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.1, 1.0, 2.2}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {-1.1, 1.0, -2.2}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.1, 1.0, 2.2, 3.3}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {-1.1, 1.0, -2.2, -3.3}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_1_N_5){ + blasint N = 5, inc = 1; + double x[] = {1.1, 1.0, 2.2, 3.3, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_1_N_5){ + blasint N = 5, inc = 1; + double x[] = {-1.1, 1.0, -2.2, -3.3, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_2_N_5){ + blasint N = 5, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0, 0.0, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_2_N_5){ + blasint N = 5, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0, 0.0, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0.0; + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0; + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 1.0; + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0; + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_daxpby.c b/utest/test_extensions/test_daxpby.c new file mode 100644 index 000000000..6e77c7c7c --- /dev/null +++ b/utest/test_extensions/test_daxpby.c @@ -0,0 +1,799 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_DAXPBY{ + double x_test[DATASIZE * INCREMENT]; + double x_verify[DATASIZE * INCREMENT]; + double y_test[DATASIZE * INCREMENT]; + double y_verify[DATASIZE * INCREMENT]; +}; + +#ifdef BUILD_DOUBLE +static struct DATA_DAXPBY data_daxpby; + +/** + * Fortran API specific function + * Test daxpby by comparing it with dscal and daxpy. + * Compare with the following options: + * + * param n - number of elements in vectors x and y + * param alpha - scalar alpha + * param incx - increment for the elements of x + * param beta - scalar beta + * param incy - increment for the elements of y + * return norm of difference + */ +static double check_daxpby(blasint n, double alpha, blasint incx, double beta, blasint incy) +{ + blasint i; + + // dscal accept only positive increments + blasint incx_abs = labs(incx); + blasint incy_abs = labs(incy); + + // Fill vectors x, y + drand_generate(data_daxpby.x_test, n * incx_abs); + drand_generate(data_daxpby.y_test, n * incy_abs); + + // Copy vector x for daxpy + for (i = 0; i < n * incx_abs; i++) + data_daxpby.x_verify[i] = data_daxpby.x_test[i]; + + // Copy vector y for dscal + for (i = 0; i < n * incy_abs; i++) + data_daxpby.y_verify[i] = data_daxpby.y_test[i]; + + // Find beta*y + BLASFUNC(dscal)(&n, &beta, data_daxpby.y_verify, &incy_abs); + + // Find sum of alpha*x and beta*y + BLASFUNC(daxpy)(&n, &alpha, data_daxpby.x_verify, &incx, + data_daxpby.y_verify, &incy); + + BLASFUNC(daxpby)(&n, &alpha, data_daxpby.x_test, &incx, + &beta, data_daxpby.y_test, &incy); + + // Find the differences between output vector caculated by daxpby and daxpy + for (i = 0; i < n * incy_abs; i++) + data_daxpby.y_test[i] -= data_daxpby.y_verify[i]; + + // Find the norm of differences + return BLASFUNC(dnrm2)(&n, data_daxpby.y_test, &incy_abs); +} + +/** + * C API specific function + * Test daxpby by comparing it with dscal and daxpy. + * Compare with the following options: + * + * param n - number of elements in vectors x and y + * param alpha - scalar alpha + * param incx - increment for the elements of x + * param beta - scalar beta + * param incy - increment for the elements of y + * return norm of difference + */ +static double c_api_check_daxpby(blasint n, double alpha, blasint incx, double beta, blasint incy) +{ + blasint i; + + // dscal accept only positive increments + blasint incx_abs = labs(incx); + blasint incy_abs = labs(incy); + + // Copy vector x for daxpy + for (i = 0; i < n * incx_abs; i++) + data_daxpby.x_verify[i] = data_daxpby.x_test[i]; + + // Copy vector y for dscal + for (i = 0; i < n * incy_abs; i++) + data_daxpby.y_verify[i] = data_daxpby.y_test[i]; + + // Find beta*y + cblas_dscal(n, beta, data_daxpby.y_verify, incy_abs); + + // Find sum of alpha*x and beta*y + cblas_daxpy(n, alpha, data_daxpby.x_verify, incx, + data_daxpby.y_verify, incy); + + cblas_daxpby(n, alpha, data_daxpby.x_test, incx, + beta, data_daxpby.y_test, incy); + + // Find the differences between output vector caculated by daxpby and daxpy + for (i = 0; i < n * incy_abs; i++) + data_daxpby.y_test[i] -= data_daxpby.y_verify[i]; + + // Find the norm of differences + return cblas_dnrm2(n, data_daxpby.y_test, incy_abs); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + */ +CTEST(daxpby, inc_x_1_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha = 1.0; + double beta = 1.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + */ +CTEST(daxpby, inc_x_2_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + double alpha = 2.0; + double beta = 1.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + */ +CTEST(daxpby, inc_x_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha = 1.0; + double beta = 2.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + */ +CTEST(daxpby, inc_x_2_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + double alpha = 3.0; + double beta = 4.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is 2 + */ +CTEST(daxpby, inc_x_neg_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = -1, incy = 2; + double alpha = 5.0; + double beta = 4.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is -1 + */ +CTEST(daxpby, inc_x_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = -1; + double alpha = 1.0; + double beta = 6.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is -1 + */ +CTEST(daxpby, inc_x_neg_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = -2, incy = -1; + double alpha = 7.0; + double beta = 3.5; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + */ +CTEST(daxpby, inc_x_1_inc_y_1_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha = 0.0; + double beta = 1.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero +*/ +CTEST(daxpby, inc_x_1_inc_y_2_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha = 0.0; + double beta = 1.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar beta is zero + */ +CTEST(daxpby, inc_x_1_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha = 1.0; + double beta = 0.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + * Scalar beta is zero +*/ +CTEST(daxpby, inc_x_2_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + double alpha = 1.0; + double beta = 0.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar beta is zero +*/ +CTEST(daxpby, inc_x_1_inc_y_2_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha = 1.0; + double beta = 0.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * Scalar beta is zero +*/ +CTEST(daxpby, inc_x_2_inc_y_2_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + double alpha = 1.0; + double beta = 0.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + * Scalar beta is zero + */ +CTEST(daxpby, inc_x_1_inc_y_1_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha = 0.0; + double beta = 0.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero + * Scalar beta is zero +*/ +CTEST(daxpby, inc_x_1_inc_y_2_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha = 0.0; + double beta = 0.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Check if n - size of vectors x, y is zero + */ +CTEST(daxpby, check_n_zero) +{ + blasint n = 0, incx = 1, incy = 1; + double alpha = 1.0; + double beta = 1.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + */ +CTEST(daxpby, c_api_inc_x_1_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha = 1.0; + double beta = 1.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + */ +CTEST(daxpby, c_api_inc_x_2_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + double alpha = 2.0; + double beta = 1.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + */ +CTEST(daxpby, c_api_inc_x_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha = 1.0; + double beta = 2.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + */ +CTEST(daxpby, c_api_inc_x_2_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + double alpha = 3.0; + double beta = 4.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is 2 + */ +CTEST(daxpby, c_api_inc_x_neg_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = -1, incy = 2; + double alpha = 5.0; + double beta = 4.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is -1 + */ +CTEST(daxpby, c_api_inc_x_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = -1; + double alpha = 1.0; + double beta = 6.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is -1 + */ +CTEST(daxpby, c_api_inc_x_neg_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = -2, incy = -1; + double alpha = 7.0; + double beta = 3.5; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + */ +CTEST(daxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha = 0.0; + double beta = 1.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero +*/ +CTEST(daxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha = 0.0; + double beta = 1.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar beta is zero + */ +CTEST(daxpby, c_api_inc_x_1_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha = 1.0; + double beta = 0.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + * Scalar beta is zero +*/ +CTEST(daxpby, c_api_inc_x_2_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + double alpha = 1.0; + double beta = 0.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar beta is zero +*/ +CTEST(daxpby, c_api_inc_x_1_inc_y_2_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha = 1.0; + double beta = 0.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * Scalar beta is zero +*/ +CTEST(daxpby, c_api_inc_x_2_inc_y_2_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + double alpha = 1.0; + double beta = 0.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + * Scalar beta is zero + */ +CTEST(daxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha = 0.0; + double beta = 0.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero + * Scalar beta is zero +*/ +CTEST(daxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha = 0.0; + double beta = 0.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Check if n - size of vectors x, y is zero + */ +CTEST(daxpby, c_api_check_n_zero) +{ + blasint n = 0, incx = 1, incy = 1; + double alpha = 1.0; + double beta = 1.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_dgeadd.c b/utest/test_extensions/test_dgeadd.c new file mode 100644 index 000000000..8f93a842e --- /dev/null +++ b/utest/test_extensions/test_dgeadd.c @@ -0,0 +1,878 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define N 100 +#define M 100 + +struct DATA_DGEADD{ + double a_test[M * N]; + double c_test[M * N]; + double c_verify[M * N]; +}; + +#ifdef BUILD_DOUBLE +static struct DATA_DGEADD data_dgeadd; + +/** + * dgeadd reference implementation + * + * param m - number of rows of A and C + * param n - number of columns of A and C + * param alpha - scaling factor for matrix A + * param aptr - refer to matrix A + * param lda - leading dimension of A + * param beta - scaling factor for matrix C + * param cptr - refer to matrix C + * param ldc - leading dimension of C + */ +static void dgeadd_trusted(blasint m, blasint n, double alpha, double *aptr, + blasint lda, double beta, double *cptr, blasint ldc) +{ + blasint i; + + for (i = 0; i < n; i++) + { + cblas_daxpby(m, alpha, aptr, 1, beta, cptr, 1); + aptr += lda; + cptr += ldc; + } +} + +/** + * Test dgeadd by comparing it against reference + * Compare with the following options: + * + * param api - specifies Fortran or C API + * param order - specifies whether A and C stored in + * row-major order or column-major order + * param m - number of rows of A and C + * param n - number of columns of A and C + * param alpha - scaling factor for matrix A + * param lda - leading dimension of A + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + * return norm of differences + */ +static double check_dgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order, + blasint m, blasint n, double alpha, blasint lda, + double beta, blasint ldc) +{ + blasint i; + blasint cols = m, rows = n; + + if (order == CblasRowMajor) + { + rows = m; + cols = n; + } + + // Fill matrix A, C + drand_generate(data_dgeadd.a_test, lda * rows); + drand_generate(data_dgeadd.c_test, ldc * rows); + + // Copy matrix C for dgeadd + for (i = 0; i < ldc * rows; i++) + data_dgeadd.c_verify[i] = data_dgeadd.c_test[i]; + + dgeadd_trusted(cols, rows, alpha, data_dgeadd.a_test, lda, + beta, data_dgeadd.c_verify, ldc); + + if (api == 'F') + BLASFUNC(dgeadd)(&m, &n, &alpha, data_dgeadd.a_test, &lda, + &beta, data_dgeadd.c_test, &ldc); + else + cblas_dgeadd(order, m, n, alpha, data_dgeadd.a_test, lda, + beta, data_dgeadd.c_test, ldc); + + // Find the differences between output matrix caculated by dgeadd and sgemm + return dmatrix_difference(data_dgeadd.c_test, data_dgeadd.c_verify, cols, rows, ldc); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param api - specifies Fortran or C API + * param order - specifies whether A and C stored in + * row-major order or column-major order + * param m - number of rows of A and C + * param n - number of columns of A and C + * param lda - leading dimension of A + * param ldc - leading dimension of C + * param expected_info - expected invalid parameter number in dgeadd + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order, + blasint m, blasint n, blasint lda, + blasint ldc, int expected_info) +{ + double alpha = 1.0; + double beta = 1.0; + + set_xerbla("DGEADD ", expected_info); + + if (api == 'F') + BLASFUNC(dgeadd)(&m, &n, &alpha, data_dgeadd.a_test, &lda, + &beta, data_dgeadd.c_test, &ldc); + else + cblas_dgeadd(order, m, n, alpha, data_dgeadd.a_test, lda, + beta, data_dgeadd.c_test, ldc); + + return check_error(); +} + +/** + * Fortran API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(dgeadd, matrix_n_100_m_100) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha = 3.0; + double beta = 3.0; + + double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar alpha is zero (operation is C:=beta*C) + */ +CTEST(dgeadd, matrix_n_100_m_100_alpha_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha = 0.0; + double beta = 2.5; + + double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar beta is zero (operation is C:=alpha*A) + */ +CTEST(dgeadd, matrix_n_100_m_100_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha = 3.0; + double beta = 0.0; + + double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalars alpha, beta is zero (operation is C:= 0) + */ +CTEST(dgeadd, matrix_n_100_m_100_alpha_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha = 0.0; + double beta = 0.0; + + double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(dgeadd, matrix_n_100_m_50) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M / 2; + + blasint lda = m; + blasint ldc = m; + + double alpha = 1.0; + double beta = 1.0; + + double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test error function for an invalid param n - + * number of columns of A and C + * Must be at least zero. + */ +CTEST(dgeadd, xerbla_n_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = m; + blasint ldc = m; + + int expected_info = 2; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + */ +CTEST(dgeadd, xerbla_m_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + */ +CTEST(dgeadd, xerbla_lda_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 5; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + */ +CTEST(dgeadd, xerbla_ldc_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Check if n - number of columns of A, C equal zero. + */ +CTEST(dgeadd, n_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 0; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + double alpha = 1.0; + double beta = 1.0; + + double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Check if m - number of rows of A and C equal zero. + */ +CTEST(dgeadd, m_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 0; + + blasint lda = 1; + blasint ldc = 1; + + double alpha = 1.0; + double beta = 1.0; + + double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(dgeadd, c_api_matrix_n_100_m_100) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha = 2.0; + double beta = 3.0; + + double norm = check_dgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * c api option order is row-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(dgeadd, c_api_matrix_n_100_m_100_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha = 4.0; + double beta = 2.0; + + double norm = check_dgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * c api option order is row-major order + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(dgeadd, c_api_matrix_n_50_m_100_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = N / 2; + blasint m = M; + + blasint lda = n; + blasint ldc = n; + + double alpha = 3.0; + double beta = 1.0; + + double norm = check_dgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar alpha is zero (operation is C:=beta*C) + */ +CTEST(dgeadd, c_api_matrix_n_100_m_100_alpha_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha = 0.0; + double beta = 1.0; + + double norm = check_dgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar beta is zero (operation is C:=alpha*A) + */ +CTEST(dgeadd, c_api_matrix_n_100_m_100_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha = 3.0; + double beta = 0.0; + + double norm = check_dgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalars alpha, beta is zero (operation is C:= 0) + */ +CTEST(dgeadd, c_api_matrix_n_100_m_100_alpha_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha = 0.0; + double beta = 0.0; + + double norm = check_dgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(dgeadd, c_api_matrix_n_100_m_50) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M / 2; + + blasint lda = m; + blasint ldc = m; + + double alpha = 3.0; + double beta = 4.0; + + double norm = check_dgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test error function for an invalid param order - + * specifies whether A and C stored in + * row-major order or column-major order + */ +CTEST(dgeadd, c_api_xerbla_invalid_order) +{ + CBLAS_ORDER order = INVALID; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 0; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n - + * number of columns of A and C. + * Must be at least zero. + * + * c api option order is column-major order + */ +CTEST(dgeadd, c_api_xerbla_n_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 2; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n - + * number of columns of A and C. + * Must be at least zero. + * + * c api option order is row-major order + */ +CTEST(dgeadd, c_api_xerbla_n_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 2; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + * + * c api option order is column-major order + */ +CTEST(dgeadd, c_api_xerbla_m_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + * + * c api option order is row-major order + */ +CTEST(dgeadd, c_api_xerbla_m_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + * + * c api option order is column-major order + */ +CTEST(dgeadd, c_api_xerbla_lda_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 5; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + * + * c api option order is row-major order + */ +CTEST(dgeadd, c_api_xerbla_lda_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 5; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + * + * c api option order is column-major order + */ +CTEST(dgeadd, c_api_xerbla_ldc_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + * + * c api option order is row-major order + */ +CTEST(dgeadd, c_api_xerbla_ldc_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Check if n - number of columns of A, C equal zero. + * + * c api option order is column-major order + */ +CTEST(dgeadd, c_api_n_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 0; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + double alpha = 1.0; + double beta = 1.0; + + double norm = check_dgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Check if m - number of rows of A and C equal zero. + * + * c api option order is column-major order + */ +CTEST(dgeadd, c_api_m_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 0; + + blasint lda = 1; + blasint ldc = 1; + + double alpha = 1.0; + double beta = 1.0; + + double norm = check_dgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_dgemmt.c b/utest/test_extensions/test_dgemmt.c new file mode 100644 index 000000000..22dcaf2aa --- /dev/null +++ b/utest/test_extensions/test_dgemmt.c @@ -0,0 +1,1442 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_DGEMMT { + double a_test[DATASIZE * DATASIZE]; + double b_test[DATASIZE * DATASIZE]; + double c_test[DATASIZE * DATASIZE]; + double c_verify[DATASIZE * DATASIZE]; + double c_gemm[DATASIZE * DATASIZE]; +}; + +#ifdef BUILD_DOUBLE +static struct DATA_DGEMMT data_dgemmt; + +/** + * Compute gemmt via gemm since gemmt is gemm but updates only + * the upper or lower triangular part of the result matrix + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order (for Fortran API column major always) + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param alpha - scaling factor for the matrix-matrix product + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + */ +static void dgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, double alpha, blasint lda, + blasint ldb, double beta, blasint ldc) +{ + blasint i, j; + + if(api == 'F') + BLASFUNC(dgemm)(&transa, &transb, &m, &m, &k, &alpha, data_dgemmt.a_test, &lda, + data_dgemmt.b_test, &ldb, &beta, data_dgemmt.c_gemm, &ldc); + else + cblas_dgemm(order, transa, transb, m, m, k, alpha, data_dgemmt.a_test, lda, + data_dgemmt.b_test, ldb, beta, data_dgemmt.c_gemm, ldc); + + if (uplo == 'L' || uplo == CblasLower) + { + for (i = 0; i < m; i++) + for (j = i; j < m; j++) + data_dgemmt.c_verify[i * ldc + j] = + data_dgemmt.c_gemm[i * ldc + j]; + } else { + for (i = 0; i < m; i++) + for (j = 0; j <= i; j++) + data_dgemmt.c_verify[i * ldc + j] = + data_dgemmt.c_gemm[i * ldc + j]; + } +} + +/** + * Comapare results computed by dgemmt and dgemmt_trusted + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order (for Fortran API column major always) + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param alpha - scaling factor for the matrix-matrix product + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + * return norm of differences + */ +static double check_dgemmt(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, double alpha, blasint lda, + blasint ldb, double beta, blasint ldc) +{ + blasint i; + blasint b_cols; + blasint a_cols; + blasint inc = 1; + blasint size_c = m * ldc; + + if(order == CblasColMajor){ + if (transa == 'T' || transa == 'C' || + transa == CblasTrans || transa == CblasConjTrans) + a_cols = m; + else a_cols = k; + + if (transb == 'T' || transb == 'C' || + transb == CblasTrans || transb == CblasConjTrans) + b_cols = k; + else b_cols = m; + } else { + if (transa == 'T' || transa == 'C' || + transa == CblasTrans || transa == CblasConjTrans) + a_cols = k; + else a_cols = m; + + if (transb == 'T' || transb == 'C' || + transb == CblasTrans || transb == CblasConjTrans) + b_cols = m; + else b_cols = k; + } + + drand_generate(data_dgemmt.a_test, a_cols * lda); + drand_generate(data_dgemmt.b_test, b_cols * ldb); + drand_generate(data_dgemmt.c_test, m * ldc); + + for (i = 0; i < m * ldc; i++) + data_dgemmt.c_gemm[i] = data_dgemmt.c_verify[i] = data_dgemmt.c_test[i]; + + dgemmt_trusted(api, order, uplo, transa, transb, m, k, alpha, lda, ldb, beta, ldc); + + if (api == 'F') + BLASFUNC(dgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, data_dgemmt.a_test, + &lda, data_dgemmt.b_test, &ldb, &beta, data_dgemmt.c_test, &ldc); + else + cblas_dgemmt(order, uplo, transa, transb, m, k, alpha, data_dgemmt.a_test, lda, + data_dgemmt.b_test, ldb, beta, data_dgemmt.c_test, ldc); + + for (i = 0; i < m * ldc; i++) + data_dgemmt.c_verify[i] -= data_dgemmt.c_test[i]; + + return BLASFUNC(dnrm2)(&size_c, data_dgemmt.c_verify, &inc) / size_c; +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param ldc - leading dimension of C + * param expected_info - expected invalid parameter number in dgemmt + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, blasint lda, blasint ldb, + blasint ldc, int expected_info) +{ + double alpha = 1.0; + double beta = 0.0; + + set_xerbla("DGEMMT ", expected_info); + + if (api == 'F') + BLASFUNC(dgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, data_dgemmt.a_test, + &lda, data_dgemmt.b_test, &ldb, &beta, data_dgemmt.c_test, &ldc); + else + cblas_dgemmt(order, uplo, transa, transb, m, k, alpha, data_dgemmt.a_test, lda, + data_dgemmt.b_test, ldb, beta, data_dgemmt.c_test, ldc); + + return check_error(); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(dgemmt, upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(dgemmt, upper_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 100; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + double alpha = 1.0; + double beta = 2.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(dgemmt, upper_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'T'; + char uplo = 'U'; + double alpha = 1.0; + double beta = 0.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(dgemmt, upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'T'; + char uplo = 'U'; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * alpha = 0.0 + */ +CTEST(dgemmt, upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + double alpha = 0.0; + double beta = 2.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * beta = 1.0 + */ +CTEST(dgemmt, upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + double alpha = 2.0; + double beta = 1.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(dgemmt, lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(dgemmt, lower_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 100; + char transa = 'T', transb = 'N'; + char uplo = 'L'; + double alpha = 1.0; + double beta = 2.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(dgemmt, lower_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'T'; + char uplo = 'L'; + double alpha = 1.0; + double beta = 0.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(dgemmt, lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'T'; + char uplo = 'L'; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * alpha = 0.0 + */ +CTEST(dgemmt, lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + double alpha = 0.0; + double beta = 2.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * beta = 1.0 + */ +CTEST(dgemmt, lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + double alpha = 2.0; + double beta = 1.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(dgemmt, c_api_colmajor_upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(dgemmt, c_api_colmajor_upper_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 100; + double alpha = 1.0; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(dgemmt, c_api_colmajor_upper_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 1.0; + double beta = 0.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(dgemmt, c_api_colmajor_upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * alpha = 0.0 + */ +CTEST(dgemmt, c_api_colmajor_upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 0.0; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * beta = 1.0 + */ +CTEST(dgemmt, c_api_colmajor_upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 2.0; + double beta = 1.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(dgemmt, c_api_colmajor_lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(dgemmt, c_api_colmajor_lower_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 100; + double alpha = 1.0; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(dgemmt, c_api_colmajor_lower_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 1.0; + double beta = 0.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(dgemmt, c_api_colmajor_lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * alpha = 0.0 + */ +CTEST(dgemmt, c_api_colmajor_lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 0.0; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * beta = 1.0 + */ +CTEST(dgemmt, c_api_colmajor_lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 2.0; + double beta = 1.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(dgemmt, c_api_rowmajor_upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(dgemmt, c_api_rowmajor_upper_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 100, ldb = 100, ldc = 100; + double alpha = 1.0; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(dgemmt, c_api_rowmajor_upper_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 100, ldb = 100, ldc = 50; + double alpha = 1.0; + double beta = 0.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(dgemmt, c_api_rowmajor_upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * alpha = 0.0 + */ +CTEST(dgemmt, c_api_rowmajor_upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 0.0; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * beta = 1.0 + */ +CTEST(dgemmt, c_api_rowmajor_upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 2.0; + double beta = 1.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(dgemmt, c_api_rowmajor_lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(dgemmt, c_api_rowmajor_lower_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 100, ldb = 100, ldc = 100; + double alpha = 1.0; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(dgemmt, c_api_rowmajor_lower_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 100, ldb = 100, ldc = 50; + double alpha = 1.0; + double beta = 0.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(dgemmt, c_api_rowmajor_lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * alpha = 0.0 + */ +CTEST(dgemmt, c_api_rowmajor_lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 0.0; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * beta = 1.0 + */ +CTEST(dgemmt, c_api_rowmajor_lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 2.0; + double beta = 1.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test error function for an invalid param uplo. + * Must be upper (U) or lower (L). + */ +CTEST(dgemmt, xerbla_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'O'; + int expected_info = 1; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param transa. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(dgemmt, xerbla_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'O', transb = 'N'; + char uplo = 'U'; + int expected_info = 2; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param transb. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(dgemmt, xerbla_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'O'; + char uplo = 'U'; + int expected_info = 3; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(dgemmt, xerbla_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 4; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(dgemmt, xerbla_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 5; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(dgemmt, xerbla_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 100, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + int expected_info = 8; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B not transposed. + */ +CTEST(dgemmt, xerbla_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 10; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(dgemmt, xerbla_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + int expected_info = 13; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. + * Test error function for an invalid param order. + * Must be column or row major. + */ +CTEST(dgemmt, xerbla_c_api_major_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 0; + + int passed = check_badargs('C', 'O', CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param uplo. + * Must be upper or lower. + */ +CTEST(dgemmt, xerbla_c_api_colmajor_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 1; + + int passed = check_badargs('C', CblasColMajor, 'O', CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param transa. + * Must be trans or no-trans. + */ +CTEST(dgemmt, xerbla_c_api_colmajor_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 2; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, 'O', CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param transb. + * Must be trans or no-trans. + */ +CTEST(dgemmt, xerbla_c_api_colmajor_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 3; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, 'O', + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(dgemmt, xerbla_c_api_colmajor_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 4; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(dgemmt, xerbla_c_api_colmajor_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 5; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(dgemmt, xerbla_c_api_colmajor_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 100, ldc = 50; + int expected_info = 8; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B not transposed. + */ +CTEST(dgemmt, xerbla_c_api_colmajor_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 10; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(dgemmt, xerbla_c_api_colmajor_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 13; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param uplo. + * Must be upper or lower. + */ +CTEST(dgemmt, xerbla_c_api_rowmajor_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 1; + + int passed = check_badargs('C', CblasRowMajor, 'O', CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param transa. + * Must be trans or no-trans. + */ +CTEST(dgemmt, xerbla_c_api_rowmajor_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 2; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, 'O', CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param transb. + * Must be trans or no-trans. + */ +CTEST(dgemmt, xerbla_c_api_rowmajor_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 3; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, 'O', + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(dgemmt, xerbla_c_api_rowmajor_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 4; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(dgemmt, xerbla_c_api_rowmajor_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 5; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(dgemmt, xerbla_c_api_rowmajor_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 8; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B transposed. + */ +CTEST(dgemmt, xerbla_c_api_rowmajor_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 10; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(dgemmt, xerbla_c_api_rowmajor_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 100, ldb = 100, ldc = 50; + int expected_info = 13; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_dimatcopy.c b/utest/test_extensions/test_dimatcopy.c new file mode 100644 index 000000000..811c356b3 --- /dev/null +++ b/utest/test_extensions/test_dimatcopy.c @@ -0,0 +1,915 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_DIMATCOPY { + double a_test[DATASIZE* DATASIZE]; + double a_verify[DATASIZE* DATASIZE]; +}; + +#ifdef BUILD_DOUBLE +static struct DATA_DIMATCOPY data_dimatcopy; + +/** + * Comapare results computed by dimatcopy and reference func + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows specifies number of rows of A + * param cols specifies number of columns of A + * param alpha specifies scaling factor for matrix A + * param lda_src - leading dimension of the matrix A + * param lda_dst - leading dimension of output matrix A + * return norm of difference between openblas and reference func + */ +static double check_dimatcopy(char api, char order, char trans, blasint rows, blasint cols, double alpha, + blasint lda_src, blasint lda_dst) +{ + blasint m, n; + blasint rows_out, cols_out; + enum CBLAS_ORDER corder; + enum CBLAS_TRANSPOSE ctrans; + + if (order == 'C') { + n = rows; m = cols; + } + else { + m = rows; n = cols; + } + + if(trans == 'T' || trans == 'C') { + rows_out = n; cols_out = m; + } + else { + rows_out = m; cols_out = n; + } + + drand_generate(data_dimatcopy.a_test, lda_src*m); + + if (trans == 'T' || trans == 'C') { + dtranspose(m, n, alpha, data_dimatcopy.a_test, lda_src, data_dimatcopy.a_verify, lda_dst); + } + else { + dcopy(m, n, alpha, data_dimatcopy.a_test, lda_src, data_dimatcopy.a_verify, lda_dst); + } + + if (api == 'F') { + BLASFUNC(dimatcopy)(&order, &trans, &rows, &cols, &alpha, data_dimatcopy.a_test, + &lda_src, &lda_dst); + } + else { + if (order == 'C') corder = CblasColMajor; + if (order == 'R') corder = CblasRowMajor; + if (trans == 'T') ctrans = CblasTrans; + if (trans == 'N') ctrans = CblasNoTrans; + if (trans == 'C') ctrans = CblasConjTrans; + if (trans == 'R') ctrans = CblasConjNoTrans; + cblas_dimatcopy(corder, ctrans, rows, cols, alpha, data_dimatcopy.a_test, + lda_src, lda_dst); + } + + // Find the differences between output matrix computed by dimatcopy and reference func + return dmatrix_difference(data_dimatcopy.a_test, data_dimatcopy.a_verify, cols_out, rows_out, lda_dst); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows specifies number of rows of A + * param cols specifies number of columns of A + * param lda_src - leading dimension of the matrix A + * param lda_dst - leading dimension of output matrix A + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char order, char trans, blasint rows, blasint cols, + blasint lda_src, blasint lda_dst, int expected_info) +{ + double alpha = 1.0; + + set_xerbla("DIMATCOPY", expected_info); + + BLASFUNC(dimatcopy)(&order, &trans, &rows, &cols, &alpha, data_dimatcopy.a_test, + &lda_src, &lda_dst); + + return check_error(); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 1.0 + */ +CTEST(dimatcopy, colmajor_trans_col_100_row_100_alpha_one) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + double alpha = 1.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 1.0 + */ +CTEST(dimatcopy, colmajor_notrans_col_100_row_100_alpha_one) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + double alpha = 1.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 0.0 + */ +CTEST(dimatcopy, colmajor_trans_col_100_row_100_alpha_zero) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + double alpha = 0.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 0.0 + */ +CTEST(dimatcopy, colmajor_notrans_col_100_row_100_alpha_zero) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + double alpha = 0.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + double alpha = 2.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + double alpha = 2.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha = 1.0 + */ +CTEST(dimatcopy, colmajor_trans_col_50_row_100_alpha_one) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + double alpha = 1.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha = 1.0 + */ +CTEST(dimatcopy, colmajor_notrans_col_50_row_100_alpha_one) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + double alpha = 1.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha = 0.0 + */ +CTEST(dimatcopy, colmajor_trans_col_50_row_100_alpha_zero) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + double alpha = 0.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha = 0.0 + */ +CTEST(dimatcopy, colmajor_notrans_col_50_row_100_alpha_zero) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + double alpha = 0.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, colmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + double alpha = 2.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, colmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + double alpha = 2.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 1.0 + */ +CTEST(dimatcopy, rowmajor_trans_col_100_row_100_alpha_one) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + double alpha = 1.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 1.0 + */ +CTEST(dimatcopy, rowmajor_notrans_col_100_row_100_alpha_one) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + double alpha = 1.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 0.0 + */ +CTEST(dimatcopy, rowmajor_trans_col_100_row_100_alpha_zero) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + double alpha = 0.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 0.0 + */ +CTEST(dimatcopy, rowmajor_notrans_col_100_row_100_alpha_zero) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + double alpha = 0.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + double alpha = 2.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + double alpha = 2.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Rectangular matrix + * alpha = 1.0 + */ +CTEST(dimatcopy, rowmajor_trans_col_100_row_50_alpha_one) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'C'; // same as trans for real matrix + double alpha = 1.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha = 1.0 + */ +CTEST(dimatcopy, rowmajor_notrans_col_100_row_50_alpha_one) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + double alpha = 1.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Rectangular matrix + * alpha = 0.0 + */ +CTEST(dimatcopy, rowmajor_trans_col_100_row_50_alpha_zero) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'C'; // same as trans for real matrix + double alpha = 0.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha = 0.0 + */ +CTEST(dimatcopy, rowmajor_notrans_col_100_row_50_alpha_zero) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + double alpha = 0.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, rowmajor_trans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'C'; // same as trans for real matrix + double alpha = 2.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, rowmajor_notrans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + double alpha = 2.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, c_api_colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + double alpha = 2.0; + + double norm = check_dimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, c_api_colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + double alpha = 2.0; + + double norm = check_dimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, c_api_rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + double alpha = 2.0; + + double norm = check_dimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, c_api_rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + double alpha = 2.0; + + double norm = check_dimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test error function for an invalid param order. + * Must be column (C) or row major (R). + */ +CTEST(dimatcopy, xerbla_invalid_order) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'O'; + char trans = 'T'; + int expected_info = 1; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param trans. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(dimatcopy, xerbla_invalid_trans) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'O'; + int expected_info = 2; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_src. + * If matrices are stored using row major layout, + * lda_src must be at least n. + */ +CTEST(dimatcopy, xerbla_rowmajor_invalid_lda) +{ + blasint m = 50, n = 100; + blasint lda_src = 50, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_src. + * If matrices are stored using column major layout, + * lda_src must be at least m. + */ +CTEST(dimatcopy, xerbla_colmajor_invalid_lda) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using row major layout and + * there is no transposition, lda_dst must be at least n. + */ +CTEST(dimatcopy, xerbla_rowmajor_notrans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'N'; + int expected_info = 8; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using row major layout and + * there is transposition, lda_dst must be at least m. + */ +CTEST(dimatcopy, xerbla_rowmajor_trans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'T'; + int expected_info = 8; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using column major layout and + * there is no transposition, lda_dst must be at least m. + */ +CTEST(dimatcopy, xerbla_colmajor_notrans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'N'; + int expected_info = 8; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using column major layout and + * there is transposition, lda_dst must be at least n. + */ +CTEST(dimatcopy, xerbla_colmajor_trans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + int expected_info = 8; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_domatcopy.c b/utest/test_extensions/test_domatcopy.c new file mode 100644 index 000000000..e60b9c83d --- /dev/null +++ b/utest/test_extensions/test_domatcopy.c @@ -0,0 +1,640 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_DOMATCOPY { + double a_test[DATASIZE * DATASIZE]; + double b_test[DATASIZE * DATASIZE]; + double b_verify[DATASIZE * DATASIZE]; +}; + +#ifdef BUILD_DOUBLE +static struct DATA_DOMATCOPY data_domatcopy; + +/** + * Comapare results computed by domatcopy and reference func + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows - number of rows of A + * param cols - number of columns of A + * param alpha - scaling factor for matrix B + * param lda - leading dimension of the matrix A + * param ldb - leading dimension of the matrix B + * return norm of difference between openblas and reference func + */ +static double check_domatcopy(char api, char order, char trans, blasint rows, blasint cols, double alpha, + blasint lda, blasint ldb) +{ + blasint b_rows, b_cols; + blasint m, n; + enum CBLAS_ORDER corder; + enum CBLAS_TRANSPOSE ctrans; + + if (order == 'C') { + m = cols; n = rows; + } + else { + m = rows; n = cols; + } + + if(trans == 'T' || trans == 'C') { + b_rows = n; b_cols = m; + } + else { + b_rows = m; b_cols = n; + } + + drand_generate(data_domatcopy.a_test, lda*m); + + if (trans == 'T' || trans == 'C') { + dtranspose(m, n, alpha, data_domatcopy.a_test, lda, data_domatcopy.b_verify, ldb); + } + else { + dcopy(m, n, alpha, data_domatcopy.a_test, lda, data_domatcopy.b_verify, ldb); + } + + if (api == 'F') { + BLASFUNC(domatcopy)(&order, &trans, &rows, &cols, &alpha, data_domatcopy.a_test, + &lda, data_domatcopy.b_test, &ldb); + } + else { + if (order == 'C') corder = CblasColMajor; + if (order == 'R') corder = CblasRowMajor; + if (trans == 'T') ctrans = CblasTrans; + if (trans == 'N') ctrans = CblasNoTrans; + if (trans == 'C') ctrans = CblasConjTrans; + if (trans == 'R') ctrans = CblasConjNoTrans; + cblas_domatcopy(corder, ctrans, rows, cols, alpha, data_domatcopy.a_test, + lda, data_domatcopy.b_test, ldb); + } + + return dmatrix_difference(data_domatcopy.b_test, data_domatcopy.b_verify, b_cols, b_rows, ldb); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows - number of rows of A + * param cols - number of columns of A + * param lda - leading dimension of the matrix A + * param ldb - leading dimension of the matrix B + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char order, char trans, blasint rows, blasint cols, + blasint lda, blasint ldb, int expected_info) +{ + double alpha = 1.0; + + set_xerbla("DOMATCOPY", expected_info); + + BLASFUNC(domatcopy)(&order, &trans, &rows, &cols, &alpha, data_domatcopy.a_test, + &lda, data_domatcopy.b_test, &ldb); + + return check_error(); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 1.0 + */ +CTEST(domatcopy, colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'T'; + double alpha = 1.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 1.0 + */ +CTEST(domatcopy, colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'N'; + double alpha = 1.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(domatcopy, colmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'T'; + double alpha = 2.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific tests + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(domatcopy, colmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'N'; + double alpha = 2.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha = 0.0 + */ +CTEST(domatcopy, colmajor_trans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 50, ldb = 100; + char order = 'C'; + char trans = 'T'; + double alpha = 0.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha = 0.0 + */ +CTEST(domatcopy, colmajor_notrans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 50, ldb = 50; + char order = 'C'; + char trans = 'N'; + double alpha = 0.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 1.0 + */ +CTEST(domatcopy, rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'T'; + double alpha = 1.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 1.0 + */ +CTEST(domatcopy, rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'N'; + double alpha = 1.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(domatcopy, rowmajor_conjtrans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'C'; // same as trans for real matrix + double alpha = 2.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(domatcopy, rowmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 50, ldb = 50; + char order = 'R'; + char trans = 'N'; + double alpha = 2.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * Matrix dimensions leave residues from 4 and 2 (specialize + * for rt case) + * alpha = 1.5 + */ +CTEST(domatcopy, rowmajor_trans_col_27_row_27) +{ + blasint m = 27, n = 27; + blasint lda = 27, ldb = 27; + char order = 'R'; + char trans = 'T'; + double alpha = 1.5; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha = 0.0 + */ +CTEST(domatcopy, rowmajor_notrans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'N'; + double alpha = 0.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 1.0 + */ +CTEST(domatcopy, c_api_colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'T'; + double alpha = 1.0; + + double norm = check_domatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 1.0 + */ +CTEST(domatcopy, c_api_colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'N'; + double alpha = 1.0; + + double norm = check_domatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 1.0 + */ +CTEST(domatcopy, c_api_rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'T'; + double alpha = 1.0; + + double norm = check_domatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 1.0 + */ +CTEST(domatcopy, c_api_rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'N'; + double alpha = 1.0; + + double norm = check_domatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test error function for an invalid param order. + * Must be column (C) or row major (R). + */ +CTEST(domatcopy, xerbla_invalid_order) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'O'; + char trans = 'T'; + int expected_info = 1; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param trans. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(domatcopy, xerbla_invalid_trans) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'O'; + int expected_info = 2; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda. + * If matrices are stored using row major layout, + * lda must be at least n. + */ +CTEST(domatcopy, xerbla_rowmajor_invalid_lda) +{ + blasint m = 50, n = 100; + blasint lda = 50, ldb = 100; + char order = 'R'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda. + * If matrices are stored using column major layout, + * lda must be at least m. + */ +CTEST(domatcopy, xerbla_colmajor_invalid_lda) +{ + blasint m = 100, n = 50; + blasint lda = 50, ldb = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using row major layout and + * there is no transposition, ldb must be at least n. + */ +CTEST(domatcopy, xerbla_rowmajor_notrans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using row major layout and + * there is transposition, ldb must be at least m. + */ +CTEST(domatcopy, xerbla_rowmajor_trans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using column major layout and + * there is no transposition, ldb must be at least m. + */ +CTEST(domatcopy, xerbla_colmajor_notrans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using column major layout and + * there is transposition, ldb must be at least n. + */ +CTEST(domatcopy, xerbla_colmajor_trans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif diff --git a/utest/test_extensions/test_drotmg.c b/utest/test_extensions/test_drotmg.c new file mode 100644 index 000000000..3073c8e3e --- /dev/null +++ b/utest/test_extensions/test_drotmg.c @@ -0,0 +1,414 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#ifdef BUILD_DOUBLE + +/** + * Fortran API specific test + * Test drotmg by comparing it against pre-calculated values + */ +CTEST(drotmg, y1_zero) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]; + double tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 2.0; + te_d2 = tr_d2 = 2.0; + te_x1 = tr_x1 = 8.0; + te_y1 = tr_y1 = 0.0; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0; + } + + //reference values as calculated by netlib blas + tr_d1 = 2.0; + tr_d2 = 2.0; + tr_x1 = 8.0; + tr_y1 = 0.0; + + tr_param[0] = -2.0; + tr_param[1] = 0.0; + tr_param[2] = 0.0; + tr_param[3] = 0.0; + tr_param[4] = 0.0; + + //OpenBLAS + BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } +} + +/** + * Fortran API specific test + * Test drotmg by comparing it against pre-calculated values + */ +CTEST(drotmg, d1_negative) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]; + double tr_param[5]; + int i = 0; + te_d1 = tr_d1 = -1.0; + te_d2 = tr_d2 = 2.0; + te_x1 = tr_x1 = 8.0; + te_y1 = tr_y1 = 8.0; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0; + } + + //reference values as calculated by netlib blas + tr_d1 = 0.0; + tr_d2 = 0.0; + tr_x1 = 0.0; + tr_y1 = 8.0; + + tr_param[0] = -1.0; + tr_param[1] = 0.0; + tr_param[2] = 0.0; + tr_param[3] = 0.0; + tr_param[4] = 0.0; + + //OpenBLAS + BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } +} + +/** + * Fortran API specific test + * Test drotmg by comparing it against pre-calculated values + */ +CTEST(drotmg, d1_positive_d2_positive_x1_zero) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]; + double tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 2.0; + te_d2 = tr_d2 = 2.0; + te_x1 = tr_x1 = 0.0; + te_y1 = tr_y1 = 8.0; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0; + } + + //reference values as calculated by netlib blas + tr_d1 = 2.0; + tr_d2 = 2.0; + tr_x1 = 8.0; + tr_y1 = 8.0; + + tr_param[0] = 1.0; + tr_param[1] = 0.0; + tr_param[2] = 0.0; + tr_param[3] = 0.0; + tr_param[4] = 0.0; + + //OpenBLAS + BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } +} + +/** + * Fortran API specific test + * Test drotmg by comparing it against pre-calculated values + */ +CTEST(drotmg, scaled_y_greater_than_scaled_x) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]; + double tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 1.0; + te_d2 = tr_d2 = -2.0; + te_x1 = tr_x1 = 8.0; + te_y1 = tr_y1 = 8.0; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0; + } + + //reference values as calculated by netlib blas + tr_d1 = 0.0; + tr_d2 = 0.0; + tr_x1 = 0.0; + tr_y1 = 8.0; + + tr_param[0] = -1.0; + tr_param[1] = 0.0; + tr_param[2] = 0.0; + tr_param[3] = 0.0; + tr_param[4] = 0.0; + + //OpenBLAS + BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } +} + +/** + * C API specific test + * Test drotmg by comparing it against pre-calculated values + */ +CTEST(drotmg, c_api_y1_zero) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]; + double tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 2.0; + te_d2 = tr_d2 = 2.0; + te_x1 = tr_x1 = 8.0; + te_y1 = tr_y1 = 0.0; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0; + } + + //reference values as calculated by netlib blas + tr_d1 = 2.0; + tr_d2 = 2.0; + tr_x1 = 8.0; + tr_y1 = 0.0; + + tr_param[0] = -2.0; + tr_param[1] = 0.0; + tr_param[2] = 0.0; + tr_param[3] = 0.0; + tr_param[4] = 0.0; + + //OpenBLAS + cblas_drotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } +} + +/** + * C API specific test + * Test drotmg by comparing it against pre-calculated values + */ +CTEST(drotmg, c_api_d1_negative) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]; + double tr_param[5]; + int i = 0; + te_d1 = tr_d1 = -1.0; + te_d2 = tr_d2 = 2.0; + te_x1 = tr_x1 = 8.0; + te_y1 = tr_y1 = 8.0; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0; + } + + //reference values as calculated by netlib blas + tr_d1 = 0.0; + tr_d2 = 0.0; + tr_x1 = 0.0; + tr_y1 = 8.0; + + tr_param[0] = -1.0; + tr_param[1] = 0.0; + tr_param[2] = 0.0; + tr_param[3] = 0.0; + tr_param[4] = 0.0; + + //OpenBLAS + cblas_drotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } +} + +/** + * C API specific test + * Test drotmg by comparing it against pre-calculated values + */ +CTEST(drotmg, c_api_d1_positive_d2_positive_x1_zero) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]; + double tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 2.0; + te_d2 = tr_d2 = 2.0; + te_x1 = tr_x1 = 0.0; + te_y1 = tr_y1 = 8.0; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0; + } + + //reference values as calculated by netlib blas + tr_d1 = 2.0; + tr_d2 = 2.0; + tr_x1 = 8.0; + tr_y1 = 8.0; + + tr_param[0] = 1.0; + tr_param[1] = 0.0; + tr_param[2] = 0.0; + tr_param[3] = 0.0; + tr_param[4] = 0.0; + + //OpenBLAS + cblas_drotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } +} + +/** + * C API specific test + * Test drotmg by comparing it against pre-calculated values + */ +CTEST(drotmg, c_api_scaled_y_greater_than_scaled_x) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]; + double tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 1.0; + te_d2 = tr_d2 = -2.0; + te_x1 = tr_x1 = 8.0; + te_y1 = tr_y1 = 8.0; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0; + } + + //reference values as calculated by netlib blas + tr_d1 = 0.0; + tr_d2 = 0.0; + tr_x1 = 0.0; + tr_y1 = 8.0; + + tr_param[0] = -1.0; + tr_param[1] = 0.0; + tr_param[2] = 0.0; + tr_param[3] = 0.0; + tr_param[4] = 0.0; + + //OpenBLAS + cblas_drotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_dsum.c b/utest/test_extensions/test_dsum.c new file mode 100644 index 000000000..e987c5a42 --- /dev/null +++ b/utest/test_extensions/test_dsum.c @@ -0,0 +1,403 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 50 +#define INCREMENT 2 + +#ifdef BUILD_DOUBLE + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < ELEMENTS * inc; i ++) { + x[i] = 1000 - i; + } + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0; + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.1}; + + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.1, 0.0}; + + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.1, -1.0}; + + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.1, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.1, -1.5, 1.0, 1.0}; + + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.1, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.1, 1.0, 2.2}; + + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.3, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.1, 0.0, -1.0, -3.0, 2.2, 3.0}; + + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.3, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.1, 1.0, -2.2, 3.3}; + + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.2, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.1, 0.0, 1.0, 2.0, 2.2, 2.7, -3.3, -5.9}; + + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_1_N_5){ + blasint N = 5, inc = 1; + double x[] = {0.0, 1.0, 2.2, 3.3, 0.0}; + + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(6.5, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_2_N_5){ + blasint N = 5, inc = 2; + double x[] = {0.0, 3.0, 1.0, -2.2, 2.2, -1.7, 3.3, 14.5, 0.0, -9.0}; + + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(6.5, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = (i & 1) ? -1.0 : 1.0; + } + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = (i & 1) ? -1.0 : 1.0; + } + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(50.0, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < ELEMENTS * inc; i ++) { + x[i] = 1000 - i; + } + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0; + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.1}; + + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.1, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.1, 0.0}; + + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.1, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.1, -1.0}; + + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.1, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.1, -1.5, 1.0, 1.0}; + + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(2.1, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.1, 1.0, 2.2}; + + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(4.3, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.1, 0.0, -1.0, -3.0, 2.2, 3.0}; + + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(2.3, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.1, 1.0, -2.2, 3.3}; + + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(3.2, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.1, 0.0, 1.0, 2.0, 2.2, 2.7, -3.3, -5.9}; + + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.0, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_1_N_5){ + blasint N = 5, inc = 1; + double x[] = {0.0, 1.0, 2.2, 3.3, 0.0}; + + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(6.5, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_2_N_5){ + blasint N = 5, inc = 2; + double x[] = {0.0, 3.0, 1.0, -2.2, 2.2, -1.7, 3.3, 14.5, 0.0, -9.0}; + + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(6.5, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = (i & 1) ? -1.0 : 1.0; + } + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = (i & 1) ? -1.0 : 1.0; + } + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(50.0, sum, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_dzamax.c b/utest/test_extensions/test_dzamax.c new file mode 100644 index 000000000..7bc0200c9 --- /dev/null +++ b/utest/test_extensions/test_dzamax.c @@ -0,0 +1,293 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 70 +#define INCREMENT 2 + +#ifdef BUILD_COMPLEX16 + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, step_zero){ + blasint i; + blasint N = ELEMENTS * 2, inc = 0; + double x[ELEMENTS * 2]; + for (i = 0; i < N; i ++) { + x[i] = i - 1000; + } + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL((fabs(x[0]) + fabs(x[1])), amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, positive_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.0, 2.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, negative_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {-1.0, -2.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, positive_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, negative_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, positive_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, negative_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, positive_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, negative_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, positive_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, negative_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0, -3.0, -1.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, positive_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 0.0, 0.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, negative_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -3.0, -1.0, 0.0, 0.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, positive_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0, -2.0, -2.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, negative_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0, -2.0, -2.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, positive_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, negative_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, positive_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i; + } + x[7 * inc * 2] = 1000.0; + x[7 * inc * 2 + 1] = 1000.0; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2000.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, negative_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = -i; + } + x[7 * inc * 2] = 1000.0; + x[7 * inc * 2 + 1] = 1000.0; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2000.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, positive_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i; + } + x[7 * inc * 2] = 1000.0; + x[7 * inc * 2 + 1] = 1000.0; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2000.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, negative_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = -i; + } + x[7 * inc * 2] = 1000.0; + x[7 * inc * 2 + 1] = 1000.0; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2000.0, amax, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_dzamin.c b/utest/test_extensions/test_dzamin.c new file mode 100644 index 000000000..549881fdc --- /dev/null +++ b/utest/test_extensions/test_dzamin.c @@ -0,0 +1,310 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 70 +#define INCREMENT 2 + +#ifdef BUILD_COMPLEX16 + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, step_zero){ + blasint i; + blasint N = ELEMENTS * 2, inc = 0; + double x[ELEMENTS * 2]; + for (i = 0; i < N; i ++) { + x[i] = i - 1000; + } + x[8] = 0.0; + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL((fabs(x[0]) + fabs(x[1])), amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, positive_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.0, 2.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, negative_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {-1.0, -2.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, positive_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, negative_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, positive_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, negative_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, positive_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, negative_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, positive_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, negative_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, positive_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, negative_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, positive_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0, -2.0, -2.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, negative_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0, -2.0, -2.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, positive_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, negative_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, positive_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, negative_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, positive_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, negative_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_dzsum.c b/utest/test_extensions/test_dzsum.c new file mode 100644 index 000000000..5139f59cb --- /dev/null +++ b/utest/test_extensions/test_dzsum.c @@ -0,0 +1,403 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 50 +#define INCREMENT 2 + +#ifdef BUILD_COMPLEX16 + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + double x[ELEMENTS]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0; + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.1, -1.0}; + + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.1, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.1, 0.0, 2.3, -1.0}; + + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.1, -1.0, 2.3, -1.0}; + + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.4, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.1, -1.5, 1.1, -1.0, 1.0, 1.0, 1.1, -1.0}; + + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.6, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.1, 1.0, 2.2, 1.1, -1.0, 0.0}; + + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.4, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.1, 0.0, -1.0, 0.0, -1.0, -3.0, -1.0, 0.0, 2.2, 3.0, -1.0, 0.0}; + + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.3, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.1, 1.0, -2.2, 3.3, 1.1, 1.0, -2.2, 3.3}; + + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(6.4, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.1, 0.0, 1.1, 1.0, 1.0, 2.0, 1.1, 1.0, 2.2, 2.7, 1.1, 1.0, -3.3, -5.9}; + + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(-0.2, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_1_N_5){ + blasint N = 5, inc = 1; + double x[] = {0.0, 1.0, 2.2, 3.3, 0.0, 0.0, 1.0, 2.2, 3.3, 0.0}; + + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(13.0, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_2_N_5){ + blasint N = 5, inc = 2; + double x[] = {0.0, 3.0, 1.0, 2.2, 1.0, -2.2, 1.0, 2.2, 2.2, -1.7, 1.0, 2.2, 3.3, 14.5, 1.0, 2.2, 0.0, -9.0, 1.0, 2.2}; + + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(11.1, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = (i & 1) ? -1.0 : 1.0; + } + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = (i & 1) ? -1.0 : 1.0; + } + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + double x[ELEMENTS]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0; + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.1, -1.0}; + + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.1, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.1, 0.0, 2.3, -1.0}; + + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.1, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.1, -1.0, 2.3, -1.0}; + + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.4, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.1, -1.5, 1.1, -1.0, 1.0, 1.0, 1.1, -1.0}; + + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.6, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.1, 1.0, 2.2, 1.1, -1.0, 0.0}; + + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(4.4, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.1, 0.0, -1.0, 0.0, -1.0, -3.0, -1.0, 0.0, 2.2, 3.0, -1.0, 0.0}; + + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(2.3, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.1, 1.0, -2.2, 3.3, 1.1, 1.0, -2.2, 3.3}; + + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(6.4, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.1, 0.0, 1.1, 1.0, 1.0, 2.0, 1.1, 1.0, 2.2, 2.7, 1.1, 1.0, -3.3, -5.9}; + + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(-0.2, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_1_N_5){ + blasint N = 5, inc = 1; + double x[] = {0.0, 1.0, 2.2, 3.3, 0.0, 0.0, 1.0, 2.2, 3.3, 0.0}; + + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(13.0, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_2_N_5){ + blasint N = 5, inc = 2; + double x[] = {0.0, 3.0, 1.0, 2.2, 1.0, -2.2, 1.0, 2.2, 2.2, -1.7, 1.0, 2.2, 3.3, 14.5, 1.0, 2.2, 0.0, -9.0, 1.0, 2.2}; + + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(11.1, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = (i & 1) ? -1.0 : 1.0; + } + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = (i & 1) ? -1.0 : 1.0; + } + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_icamin.c b/utest/test_extensions/test_icamin.c new file mode 100644 index 000000000..cca464eac --- /dev/null +++ b/utest/test_extensions/test_icamin.c @@ -0,0 +1,625 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 50 +#define INCREMENT 2 + +#ifdef BUILD_COMPLEX + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(0, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + float x[ELEMENTS * 2]; + for (i = 0; i < N * 2; i ++) { + x[i] = i - 1000; + } + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(0, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, positive_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.0f, 2.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, negative_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {-1.0f, -2.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, positive_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, negative_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, positive_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, negative_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, positive_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, negative_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, positive_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, negative_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, positive_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, negative_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, positive_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f, -2.0f, -2.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, negative_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f, -2.0f, -2.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, positive_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, negative_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, positive_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(8, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, negative_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(8, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, positive_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(8, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, negative_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(8, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, min_idx_in_vec_tail){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc * 2] = 0.0f; + x[(N - 1) * inc * 2 + 1] = 0.0f; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(N, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + float x[ELEMENTS * 2]; + for (i = 0; i < N * 2; i ++) { + x[i] = i - 1000; + } + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_positive_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.0f, 2.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_negative_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {-1.0f, -2.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_positive_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_negative_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_positive_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_negative_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_positive_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_negative_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_positive_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_negative_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_positive_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_negative_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_positive_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f, -2.0f, -2.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_negative_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f, -2.0f, -2.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_positive_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_negative_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_positive_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(7, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_negative_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(7, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_positive_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(7, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_negative_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(7, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_min_idx_in_vec_tail){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc * 2] = 0.0f; + x[(N - 1) * inc * 2 + 1] = 0.0f; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(N - 1, index); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_idamin.c b/utest/test_extensions/test_idamin.c new file mode 100644 index 000000000..bebe76dba --- /dev/null +++ b/utest/test_extensions/test_idamin.c @@ -0,0 +1,789 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 50 +#define INCREMENT 2 + +#ifdef BUILD_DOUBLE + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < ELEMENTS * inc; i ++) { + x[i] = 1000 - i; + } + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(0, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + double x[ELEMENTS]; + for (i = 0; i < N; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0; + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(0, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.1}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {-1.1}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.1, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {-1.1, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.1, 1.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {-1.1, 1.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.1, 1.0, 2.2}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {-1.1, 1.0, -2.2}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.1, 1.0, 2.2, 3.3}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {-1.1, 1.0, -2.2, -3.3}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_1_N_5){ + blasint N = 5, inc = 1; + double x[] = {1.1, 1.0, 2.2, 3.3, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(5, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_1_N_5){ + blasint N = 5, inc = 1; + double x[] = {-1.1, 1.0, -2.2, -3.3, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(5, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_2_N_5){ + blasint N = 5, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0, 0.0, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(5, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_2_N_5){ + blasint N = 5, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0, 0.0, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(5, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0.0; + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0; + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0.0; + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0; + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, min_idx_in_vec_tail){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc] = 0.0; + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(N, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, min_idx_in_vec_tail_inc_1){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double *x = (double*)malloc(ELEMENTS * inc * sizeof(double)); + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc] = 0.0f; + blasint index = BLASFUNC(idamin)(&N, x, &inc); + free(x); + ASSERT_EQUAL(N, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < ELEMENTS * inc; i ++) { + x[i] = 1000 - i; + } + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + double x[ELEMENTS]; + for (i = 0; i < N; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0; + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.1}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {-1.1}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.1, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {-1.1, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.1, 1.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {-1.1, 1.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.1, 1.0, 2.2}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {-1.1, 1.0, -2.2}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.1, 1.0, 2.2, 3.3}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {-1.1, 1.0, -2.2, -3.3}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_1_N_5){ + blasint N = 5, inc = 1; + double x[] = {1.1, 1.0, 2.2, 3.3, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(4, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_1_N_5){ + blasint N = 5, inc = 1; + double x[] = {-1.1, 1.0, -2.2, -3.3, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(4, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_2_N_5){ + blasint N = 5, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0, 0.0, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(4, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_2_N_5){ + blasint N = 5, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0, 0.0, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(4, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0.0; + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(8, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0; + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(8, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0.0; + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(8, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0; + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(8, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_min_idx_in_vec_tail){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc] = 0.0; + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(N - 1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_min_idx_in_vec_tail_inc_1){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double *x = (double*) malloc(ELEMENTS * inc * sizeof(double)); + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc] = 0.0; + blasint index = cblas_idamin(N, x, inc); + free(x); + ASSERT_EQUAL(N - 1, index); +} +#endif diff --git a/utest/test_extensions/test_isamin.c b/utest/test_extensions/test_isamin.c new file mode 100644 index 000000000..d93813e6f --- /dev/null +++ b/utest/test_extensions/test_isamin.c @@ -0,0 +1,789 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 50 +#define INCREMENT 2 + +#ifdef BUILD_SINGLE + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < ELEMENTS * inc; i ++) { + x[i] = 1000 - i; + } + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(0, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + float x[ELEMENTS]; + for (i = 0; i < N; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0f; + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(0, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.1f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {-1.1f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.1f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {-1.1f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.1f, 1.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {-1.1f, 1.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {-1.1f, 1.0f, -2.2f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f, 3.3f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {-1.1f, 1.0f, -2.2f, -3.3f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_1_N_5){ + blasint N = 5, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f, 3.3f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(5, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_1_N_5){ + blasint N = 5, inc = 1; + float x[] = {-1.1f, 1.0f, -2.2f, -3.3f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(5, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_2_N_5){ + blasint N = 5, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f, 0.0f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(5, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_2_N_5){ + blasint N = 5, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f, 0.0f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(5, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0.0f; + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0f; + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0.0f; + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0f; + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, min_idx_in_vec_tail){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc] = 0.0f; + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(N, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, min_idx_in_vec_tail_inc_1){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float *x = (float*) malloc(ELEMENTS * inc * sizeof(float)); + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc] = 0.0f; + blasint index = BLASFUNC(isamin)(&N, x, &inc); + free(x); + ASSERT_EQUAL(N, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < ELEMENTS * inc; i ++) { + x[i] = 1000 - i; + } + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + float x[ELEMENTS]; + for (i = 0; i < N; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0f; + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.1f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {-1.1f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.1f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {-1.1f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.1f, 1.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {-1.1f, 1.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {-1.1f, 1.0f, -2.2f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f, 3.3f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {-1.1f, 1.0f, -2.2f, -3.3f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_1_N_5){ + blasint N = 5, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f, 3.3f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(4, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_1_N_5){ + blasint N = 5, inc = 1; + float x[] = {-1.1f, 1.0f, -2.2f, -3.3f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(4, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_2_N_5){ + blasint N = 5, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f, 0.0f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(4, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_2_N_5){ + blasint N = 5, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f, 0.0f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(4, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0.0f; + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(8, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0f; + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(8, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0.0f; + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(8, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0f; + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(8, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_min_idx_in_vec_tail){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc] = 0.0f; + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(N - 1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_min_idx_in_vec_tail_inc_1){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float *x = (float*)malloc(ELEMENTS * inc * sizeof(float)); + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc] = 0.0f; + blasint index = cblas_isamin(N, x, inc); + free(x); + ASSERT_EQUAL(N - 1, index); +} +#endif diff --git a/utest/test_extensions/test_izamin.c b/utest/test_extensions/test_izamin.c new file mode 100644 index 000000000..a0bdae8e2 --- /dev/null +++ b/utest/test_extensions/test_izamin.c @@ -0,0 +1,625 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 50 +#define INCREMENT 2 + +#ifdef BUILD_COMPLEX16 + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(0, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + double x[ELEMENTS * 2]; + for (i = 0; i < N * 2; i ++) { + x[i] = i - 1000; + } + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(0, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, positive_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.0, 2.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, negative_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {-1.0, -2.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, positive_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, negative_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, positive_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, negative_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, positive_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, negative_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, positive_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, negative_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, positive_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, negative_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, positive_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0, -2.0, -2.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, negative_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0, -2.0, -2.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, positive_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, negative_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, positive_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(8, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, negative_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(8, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, positive_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(8, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, negative_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(8, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, min_idx_in_vec_tail){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc * 2] = 0.0; + x[(N - 1) * inc * 2 + 1] = 0.0; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(N, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + double x[ELEMENTS * 2]; + for (i = 0; i < N * 2; i ++) { + x[i] = i - 1000; + } + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_positive_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.0, 2.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_negative_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {-1.0, -2.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_positive_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_negative_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_positive_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_negative_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_positive_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_negative_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_positive_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_negative_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_positive_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_negative_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_positive_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0, -2.0, -2.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_negative_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0, -2.0, -2.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_positive_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_negative_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_positive_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(7, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_negative_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(7, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_positive_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(7, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_negative_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(7, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_min_idx_in_vec_tail){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc * 2] = 0.0; + x[(N - 1) * inc * 2 + 1] = 0.0; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(N - 1, index); +} +#endif diff --git a/utest/test_extensions/test_samin.c b/utest/test_extensions/test_samin.c new file mode 100644 index 000000000..fd34d462a --- /dev/null +++ b/utest/test_extensions/test_samin.c @@ -0,0 +1,354 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 70 +#define INCREMENT 2 + +#ifdef BUILD_SINGLE + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < ELEMENTS * inc; i ++) { + x[i] = 1000 - i; + } + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + float x[ELEMENTS]; + for (i = 0; i < N; i ++) { + x[i] = i - 1000; + } + x[8] = 0.0; + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(fabsf(x[0]), amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.1f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {-1.1f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.1f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {-1.1f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.1f, 1.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {-1.1f, 1.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {-1.1f, 1.0f, -2.2f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f, 3.3f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {-1.1f, 1.0f, -2.2f, -3.3f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_1_N_5){ + blasint N = 5, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f, 3.3f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_1_N_5){ + blasint N = 5, inc = 1; + float x[] = {-1.1f, 1.0f, -2.2f, -3.3f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_2_N_5){ + blasint N = 5, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_2_N_5){ + blasint N = 5, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0.0f; + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0f; + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 1.0f; + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0f; + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_saxpby.c b/utest/test_extensions/test_saxpby.c new file mode 100644 index 000000000..b4bd5cf0b --- /dev/null +++ b/utest/test_extensions/test_saxpby.c @@ -0,0 +1,794 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_SAXPBY { + float x_test[DATASIZE * INCREMENT]; + float x_verify[DATASIZE * INCREMENT]; + float y_test[DATASIZE * INCREMENT]; + float y_verify[DATASIZE * INCREMENT]; +}; +#ifdef BUILD_SINGLE +static struct DATA_SAXPBY data_saxpby; + +/** + * Fortran API specific function + * Test saxpby by comparing it with sscal and saxpy. + * Compare with the following options: + * + * param n - number of elements in vectors x and y + * param alpha - scalar alpha + * param incx - increment for the elements of x + * param beta - scalar beta + * param incy - increment for the elements of y + * return norm of difference + */ +static float check_saxpby(blasint n, float alpha, blasint incx, float beta, blasint incy) +{ + blasint i; + + // sscal accept only positive increments + blasint incx_abs = labs(incx); + blasint incy_abs = labs(incy); + + // Fill vectors x, y + srand_generate(data_saxpby.x_test, n * incx_abs); + srand_generate(data_saxpby.y_test, n * incy_abs); + + // Copy vector x for saxpy + for (i = 0; i < n * incx_abs; i++) + data_saxpby.x_verify[i] = data_saxpby.x_test[i]; + + // Copy vector y for sscal + for (i = 0; i < n * incy_abs; i++) + data_saxpby.y_verify[i] = data_saxpby.y_test[i]; + + // Find beta*y + BLASFUNC(sscal)(&n, &beta, data_saxpby.y_verify, &incy_abs); + + // Find sum of alpha*x and beta*y + BLASFUNC(saxpy)(&n, &alpha, data_saxpby.x_verify, &incx, + data_saxpby.y_verify, &incy); + + BLASFUNC(saxpby)(&n, &alpha, data_saxpby.x_test, &incx, + &beta, data_saxpby.y_test, &incy); + + // Find the differences between output vector caculated by saxpby and saxpy + for (i = 0; i < n * incy_abs; i++) + data_saxpby.y_test[i] -= data_saxpby.y_verify[i]; + + // Find the norm of differences + return BLASFUNC(snrm2)(&n, data_saxpby.y_test, &incy_abs); +} + +/** + * C API specific function + * Test saxpby by comparing it with sscal and saxpy. + * Compare with the following options: + * + * param n - number of elements in vectors x and y + * param alpha - scalar alpha + * param incx - increment for the elements of x + * param beta - scalar beta + * param incy - increment for the elements of y + * return norm of difference + */ +static float c_api_check_saxpby(blasint n, float alpha, blasint incx, float beta, blasint incy) +{ + blasint i; + + // sscal accept only positive increments + blasint incx_abs = labs(incx); + blasint incy_abs = labs(incy); + + // Copy vector x for saxpy + for (i = 0; i < n * incx_abs; i++) + data_saxpby.x_verify[i] = data_saxpby.x_test[i]; + + // Copy vector y for sscal + for (i = 0; i < n * incy_abs; i++) + data_saxpby.y_verify[i] = data_saxpby.y_test[i]; + + // Find beta*y + cblas_sscal(n, beta, data_saxpby.y_verify, incy_abs); + + // Find sum of alpha*x and beta*y + cblas_saxpy(n, alpha, data_saxpby.x_verify, incx, + data_saxpby.y_verify, incy); + + cblas_saxpby(n, alpha, data_saxpby.x_test, incx, + beta, data_saxpby.y_test, incy); + + // Find the differences between output vector caculated by saxpby and saxpy + for (i = 0; i < n * incy_abs; i++) + data_saxpby.y_test[i] -= data_saxpby.y_verify[i]; + + // Find the norm of differences + return cblas_snrm2(n, data_saxpby.y_test, incy_abs); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + */ +CTEST(saxpby, inc_x_1_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha = 1.0f; + float beta = 1.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + */ +CTEST(saxpby, inc_x_2_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + float alpha = 2.0f; + float beta = 1.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + */ +CTEST(saxpby, inc_x_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha = 1.0f; + float beta = 2.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + */ +CTEST(saxpby, inc_x_2_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + float alpha = 3.0f; + float beta = 4.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is 2 + */ +CTEST(saxpby, inc_x_neg_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = -1, incy = 2; + float alpha = 5.0f; + float beta = 4.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is -1 + */ +CTEST(saxpby, inc_x_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = -1; + float alpha = 1.0f; + float beta = 6.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is -1 + */ +CTEST(saxpby, inc_x_neg_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = -2, incy = -1; + float alpha = 7.0f; + float beta = 3.5f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + */ +CTEST(saxpby, inc_x_1_inc_y_1_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha = 0.0f; + float beta = 1.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero +*/ +CTEST(saxpby, inc_x_1_inc_y_2_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha = 0.0f; + float beta = 1.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar beta is zero + */ +CTEST(saxpby, inc_x_1_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + * Scalar beta is zero +*/ +CTEST(saxpby, inc_x_2_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar beta is zero +*/ +CTEST(saxpby, inc_x_1_inc_y_2_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * Scalar beta is zero +*/ +CTEST(saxpby, inc_x_2_inc_y_2_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + * Scalar beta is zero + */ +CTEST(saxpby, inc_x_1_inc_y_1_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha = 0.0f; + float beta = 0.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero + * Scalar beta is zero +*/ +CTEST(saxpby, inc_x_1_inc_y_2_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha = 0.0f; + float beta = 0.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Check if n - size of vectors x, y is zero + */ +CTEST(saxpby, check_n_zero) +{ + blasint n = 0, incx = 1, incy = 1; + float alpha = 1.0f; + float beta = 1.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + */ +CTEST(saxpby, c_api_inc_x_1_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha = 1.0f; + float beta = 1.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + */ +CTEST(saxpby, c_api_inc_x_2_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + float alpha = 2.0f; + float beta = 1.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + */ +CTEST(saxpby, c_api_inc_x_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha = 1.0f; + float beta = 2.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + */ +CTEST(saxpby, c_api_inc_x_2_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + float alpha = 3.0f; + float beta = 4.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is 2 + */ +CTEST(saxpby, c_api_inc_x_neg_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = -1, incy = 2; + float alpha = 5.0f; + float beta = 4.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is -1 + */ +CTEST(saxpby, c_api_inc_x_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = -1; + float alpha = 1.0f; + float beta = 6.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is -1 + */ +CTEST(saxpby, c_api_inc_x_neg_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = -2, incy = -1; + float alpha = 7.0f; + float beta = 3.5f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + */ +CTEST(saxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha = 0.0f; + float beta = 1.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero +*/ +CTEST(saxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha = 0.0f; + float beta = 1.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar beta is zero + */ +CTEST(saxpby, c_api_inc_x_1_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + * Scalar beta is zero +*/ +CTEST(saxpby, c_api_inc_x_2_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar beta is zero +*/ +CTEST(saxpby, c_api_inc_x_1_inc_y_2_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * Scalar beta is zero +*/ +CTEST(saxpby, c_api_inc_x_2_inc_y_2_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + * Scalar beta is zero + */ +CTEST(saxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha = 0.0f; + float beta = 0.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero + * Scalar beta is zero +*/ +CTEST(saxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha = 0.0f; + float beta = 0.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Check if n - size of vectors x, y is zero + */ +CTEST(saxpby, c_api_check_n_zero) +{ + blasint n = 0, incx = 1, incy = 1; + float alpha = 1.0f; + float beta = 1.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_scamax.c b/utest/test_extensions/test_scamax.c new file mode 100644 index 000000000..8c214ddff --- /dev/null +++ b/utest/test_extensions/test_scamax.c @@ -0,0 +1,293 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 70 +#define INCREMENT 2 + +#ifdef BUILD_COMPLEX + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, step_zero){ + blasint i; + blasint N = ELEMENTS * 2, inc = 0; + float x[ELEMENTS * 2]; + for (i = 0; i < N; i ++) { + x[i] = i - 1000; + } + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL((fabsf(x[0]) + fabsf(x[1])), amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, positive_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.0f, 2.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, negative_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {-1.0f, -2.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, positive_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, negative_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, positive_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, negative_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, positive_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, negative_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, positive_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, negative_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -3.0f, -1.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, positive_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 3.0f, 1.0f, 0.0f, 0.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, negative_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -3.0f, -1.0f, 0.0f, 0.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, positive_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f, -2.0f, -2.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, negative_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f, -2.0f, -2.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, positive_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, negative_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, positive_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i; + } + x[7 * inc * 2] = 1000.0f; + x[7 * inc * 2 + 1] = 1000.0f; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2000.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, negative_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = -i; + } + x[7 * inc * 2] = 1000.0f; + x[7 * inc * 2 + 1] = 1000.0f; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2000.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, positive_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i; + } + x[7 * inc * 2] = 1000.0f; + x[7 * inc * 2 + 1] = 1000.0f; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2000.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, negative_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = -i; + } + x[7 * inc * 2] = 1000.0f; + x[7 * inc * 2 + 1] = 1000.0f; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2000.0f, amax, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_scamin.c b/utest/test_extensions/test_scamin.c new file mode 100644 index 000000000..507548f2a --- /dev/null +++ b/utest/test_extensions/test_scamin.c @@ -0,0 +1,310 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 70 +#define INCREMENT 2 + +#ifdef BUILD_COMPLEX + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, step_zero){ + blasint i; + blasint N = ELEMENTS * 2, inc = 0; + float x[ELEMENTS * 2]; + for (i = 0; i < N; i ++) { + x[i] = i - 1000; + } + x[8] = 0.0f; + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL((fabsf(x[0]) + fabsf(x[1])), amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, positive_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.0f, 2.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, negative_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {-1.0f, -2.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, positive_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, negative_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, positive_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, negative_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, positive_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, negative_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, positive_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, negative_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, positive_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, negative_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, positive_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f, -2.0f, -2.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, negative_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f, -2.0f, -2.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, positive_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, negative_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, positive_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, negative_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, positive_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, negative_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_scsum.c b/utest/test_extensions/test_scsum.c new file mode 100644 index 000000000..492e1a4ca --- /dev/null +++ b/utest/test_extensions/test_scsum.c @@ -0,0 +1,403 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 50 +#define INCREMENT 2 + +#ifdef BUILD_COMPLEX + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + float x[ELEMENTS]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0f; + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.1f, -1.0f}; + + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.1f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.1f, 0.0f, 2.3f, -1.0f}; + + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.1f, -1.0f, 2.3f, -1.0f}; + + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.4f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.1f, -1.5f, 1.1f, -1.0f, 1.0f, 1.0f, 1.1f, -1.0f}; + + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.6f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f, 1.1f, -1.0f, 0.0f}; + + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.4f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.1f, 0.0f, -1.0f, 0.0f, -1.0f, -3.0f, -1.0f, 0.0f, 2.2f, 3.0f, -1.0f, 0.0f}; + + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.3f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.1f, 1.0f, -2.2f, 3.3f, 1.1f, 1.0f, -2.2f, 3.3f}; + + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(6.4f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.1f, 0.0f, 1.1f, 1.0f, 1.0f, 2.0f, 1.1f, 1.0f, 2.2f, 2.7f, 1.1f, 1.0f, -3.3f, -5.9f}; + + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(-0.2f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_1_N_5){ + blasint N = 5, inc = 1; + float x[] = {0.0f, 1.0f, 2.2f, 3.3f, 0.0f, 0.0f, 1.0f, 2.2f, 3.3f, 0.0f}; + + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(13.0f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_2_N_5){ + blasint N = 5, inc = 2; + float x[] = {0.0f, 3.0f, 1.0f, 2.2f, 1.0f, -2.2f, 1.0f, 2.2f, 2.2f, -1.7f, 1.0f, 2.2f, 3.3f, 14.5f, 1.0f, 2.2f, 0.0f, -9.0f, 1.0f, 2.2f}; + + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(11.1f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = (i & 1) ? -1.0f : 1.0f; + } + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = (i & 1) ? -1.0f : 1.0f; + } + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + float x[ELEMENTS]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0f; + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.1f, -1.0f}; + + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.1f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.1f, 0.0f, 2.3f, -1.0f}; + + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.1f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.1f, -1.0f, 2.3f, -1.0f}; + + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.4f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.1f, -1.5f, 1.1f, -1.0f, 1.0f, 1.0f, 1.1f, -1.0f}; + + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.6f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f, 1.1f, -1.0f, 0.0f}; + + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(4.4f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.1f, 0.0f, -1.0f, 0.0f, -1.0f, -3.0f, -1.0f, 0.0f, 2.2f, 3.0f, -1.0f, 0.0f}; + + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(2.3f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.1f, 1.0f, -2.2f, 3.3f, 1.1f, 1.0f, -2.2f, 3.3f}; + + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(6.4f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.1f, 0.0f, 1.1f, 1.0f, 1.0f, 2.0f, 1.1f, 1.0f, 2.2f, 2.7f, 1.1f, 1.0f, -3.3f, -5.9f}; + + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(-0.2f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_1_N_5){ + blasint N = 5, inc = 1; + float x[] = {0.0f, 1.0f, 2.2f, 3.3f, 0.0f, 0.0f, 1.0f, 2.2f, 3.3f, 0.0f}; + + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(13.0f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_2_N_5){ + blasint N = 5, inc = 2; + float x[] = {0.0f, 3.0f, 1.0f, 2.2f, 1.0f, -2.2f, 1.0f, 2.2f, 2.2f, -1.7f, 1.0f, 2.2f, 3.3f, 14.5f, 1.0f, 2.2f, 0.0f, -9.0f, 1.0f, 2.2f}; + + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(11.1f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = (i & 1) ? -1.0f : 1.0f; + } + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = (i & 1) ? -1.0f : 1.0f; + } + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_sgeadd.c b/utest/test_extensions/test_sgeadd.c new file mode 100644 index 000000000..171132b9d --- /dev/null +++ b/utest/test_extensions/test_sgeadd.c @@ -0,0 +1,880 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define N 100 +#define M 100 + +struct DATA_SGEADD +{ + float a_test[M * N]; + float c_test[M * N]; + float c_verify[M * N]; +}; + +#ifdef BUILD_SINGLE +static struct DATA_SGEADD data_sgeadd; + +/** + * sgeadd reference implementation + * + * param m - number of rows of A and C + * param n - number of columns of A and C + * param alpha - scaling factor for matrix A + * param aptr - refer to matrix A + * param lda - leading dimension of A + * param beta - scaling factor for matrix C + * param cptr - refer to matrix C + * param ldc - leading dimension of C + */ +static void sgeadd_trusted(blasint m, blasint n, float alpha, float *aptr, + blasint lda, float beta, float *cptr, blasint ldc) +{ + blasint i; + + for (i = 0; i < n; i++) + { + cblas_saxpby(m, alpha, aptr, 1, beta, cptr, 1); + aptr += lda; + cptr += ldc; + } +} + +/** + * Test sgeadd by comparing it against reference + * Compare with the following options: + * + * param api - specifies Fortran or C API + * param order - specifies whether A and C stored in + * row-major order or column-major order + * param m - number of rows of A and C + * param n - number of columns of A and C + * param alpha - scaling factor for matrix A + * param lda - leading dimension of A + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + * return norm of differences + */ +static float check_sgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order, + blasint m, blasint n, float alpha, blasint lda, + float beta, blasint ldc) +{ + blasint i; + blasint cols = m, rows = n; + + if (order == CblasRowMajor) + { + rows = m; + cols = n; + } + + // Fill matrix A, C + srand_generate(data_sgeadd.a_test, lda * rows); + srand_generate(data_sgeadd.c_test, ldc * rows); + + // Copy matrix C for sgeadd + for (i = 0; i < ldc * rows; i++) + data_sgeadd.c_verify[i] = data_sgeadd.c_test[i]; + + sgeadd_trusted(cols, rows, alpha, data_sgeadd.a_test, lda, + beta, data_sgeadd.c_verify, ldc); + + if (api == 'F') + BLASFUNC(sgeadd) + (&m, &n, &alpha, data_sgeadd.a_test, &lda, + &beta, data_sgeadd.c_test, &ldc); + else + cblas_sgeadd(order, m, n, alpha, data_sgeadd.a_test, lda, + beta, data_sgeadd.c_test, ldc); + + // Find the differences between output matrix caculated by sgeadd and sgemm + return smatrix_difference(data_sgeadd.c_test, data_sgeadd.c_verify, cols, rows, ldc); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param api - specifies Fortran or C API + * param order - specifies whether A and C stored in + * row-major order or column-major order + * param m - number of rows of A and C + * param n - number of columns of A and C + * param lda - leading dimension of A + * param ldc - leading dimension of C + * param expected_info - expected invalid parameter number in sgeadd + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order, + blasint m, blasint n, blasint lda, + blasint ldc, int expected_info) +{ + float alpha = 1.0f; + float beta = 1.0f; + + set_xerbla("SGEADD ", expected_info); + + if (api == 'F') + BLASFUNC(sgeadd) + (&m, &n, &alpha, data_sgeadd.a_test, &lda, + &beta, data_sgeadd.c_test, &ldc); + else + cblas_sgeadd(order, m, n, alpha, data_sgeadd.a_test, lda, + beta, data_sgeadd.c_test, ldc); + + return check_error(); +} + +/** + * Fortran API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(sgeadd, matrix_n_100_m_100) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha = 3.0f; + float beta = 3.0f; + + float norm = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar alpha is zero (operation is C:=beta*C) + */ +CTEST(sgeadd, matrix_n_100_m_100_alpha_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha = 0.0f; + float beta = 2.5f; + + float norm = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar beta is zero (operation is C:=alpha*A) + */ +CTEST(sgeadd, matrix_n_100_m_100_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha = 3.0f; + float beta = 0.0f; + + float norm = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalars alpha, beta is zero (operation is C:= 0) + */ +CTEST(sgeadd, matrix_n_100_m_100_alpha_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha = 0.0f; + float beta = 0.0f; + + float norm = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(sgeadd, matrix_n_100_m_50) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M / 2; + + blasint lda = m; + blasint ldc = m; + + float alpha = 1.0f; + float beta = 1.0f; + + float norm = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test error function for an invalid param n - + * number of columns of A and C + * Must be at least zero. + */ +CTEST(sgeadd, xerbla_n_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = m; + blasint ldc = m; + + int expected_info = 2; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + */ +CTEST(sgeadd, xerbla_m_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + */ +CTEST(sgeadd, xerbla_lda_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 5; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + */ +CTEST(sgeadd, xerbla_ldc_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Check if n - number of columns of A, C equal zero. + */ +CTEST(sgeadd, n_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 0; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + float alpha = 1.0f; + float beta = 1.0f; + + float norm = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Check if m - number of rows of A and C equal zero. + */ +CTEST(sgeadd, m_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 0; + + blasint lda = 1; + blasint ldc = 1; + + float alpha = 1.0f; + float beta = 1.0f; + + float norm = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(sgeadd, c_api_matrix_n_100_m_100) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha = 2.0f; + float beta = 3.0f; + + float norm = check_sgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * c api option order is row-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(sgeadd, c_api_matrix_n_100_m_100_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha = 4.0f; + float beta = 2.0f; + + float norm = check_sgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * c api option order is row-major order + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(sgeadd, c_api_matrix_n_50_m_100_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = N / 2; + blasint m = M; + + blasint lda = n; + blasint ldc = n; + + float alpha = 3.0f; + float beta = 1.0f; + + float norm = check_sgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar alpha is zero (operation is C:=beta*C) + */ +CTEST(sgeadd, c_api_matrix_n_100_m_100_alpha_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha = 0.0f; + float beta = 1.0f; + + float norm = check_sgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar beta is zero (operation is C:=alpha*A) + */ +CTEST(sgeadd, c_api_matrix_n_100_m_100_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha = 3.0f; + float beta = 0.0f; + + float norm = check_sgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalars alpha, beta is zero (operation is C:= 0) + */ +CTEST(sgeadd, c_api_matrix_n_100_m_100_alpha_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha = 0.0f; + float beta = 0.0f; + + float norm = check_sgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(sgeadd, c_api_matrix_n_100_m_50) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M / 2; + + blasint lda = m; + blasint ldc = m; + + float alpha = 3.0f; + float beta = 4.0f; + + float norm = check_sgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test error function for an invalid param order - + * specifies whether A and C stored in + * row-major order or column-major order + */ +CTEST(sgeadd, c_api_xerbla_invalid_order) +{ + CBLAS_ORDER order = INVALID; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 0; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n - + * number of columns of A and C. + * Must be at least zero. + * + * c api option order is column-major order + */ +CTEST(sgeadd, c_api_xerbla_n_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 2; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n - + * number of columns of A and C. + * Must be at least zero. + * + * c api option order is row-major order + */ +CTEST(sgeadd, c_api_xerbla_n_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 2; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + * + * c api option order is column-major order + */ +CTEST(sgeadd, c_api_xerbla_m_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + * + * c api option order is row-major order + */ +CTEST(sgeadd, c_api_xerbla_m_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + * + * c api option order is column-major order + */ +CTEST(sgeadd, c_api_xerbla_lda_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 5; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + * + * c api option order is row-major order + */ +CTEST(sgeadd, c_api_xerbla_lda_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 5; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + * + * c api option order is column-major order + */ +CTEST(sgeadd, c_api_xerbla_ldc_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + * + * c api option order is row-major order + */ +CTEST(sgeadd, c_api_xerbla_ldc_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Check if n - number of columns of A, C equal zero. + * + * c api option order is column-major order + */ +CTEST(sgeadd, c_api_n_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 0; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + float alpha = 1.0f; + float beta = 1.0f; + + float norm = check_sgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Check if m - number of rows of A and C equal zero. + * + * c api option order is column-major order + */ +CTEST(sgeadd, c_api_m_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 0; + + blasint lda = 1; + blasint ldc = 1; + + float alpha = 1.0f; + float beta = 1.0f; + + float norm = check_sgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_sgemmt.c b/utest/test_extensions/test_sgemmt.c new file mode 100644 index 000000000..5b51e3579 --- /dev/null +++ b/utest/test_extensions/test_sgemmt.c @@ -0,0 +1,1442 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_SGEMMT { + float a_test[DATASIZE * DATASIZE]; + float b_test[DATASIZE * DATASIZE]; + float c_test[DATASIZE * DATASIZE]; + float c_verify[DATASIZE * DATASIZE]; + float c_gemm[DATASIZE * DATASIZE]; +}; + +#ifdef BUILD_SINGLE +static struct DATA_SGEMMT data_sgemmt; + +/** + * Compute gemmt via gemm since gemmt is gemm but updates only + * the upper or lower triangular part of the result matrix + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order (for Fortran API column major always) + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param alpha - scaling factor for the matrix-matrix product + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + */ +static void sgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, float alpha, blasint lda, + blasint ldb, float beta, blasint ldc) +{ + blasint i, j; + + if(api == 'F') + BLASFUNC(sgemm)(&transa, &transb, &m, &m, &k, &alpha, data_sgemmt.a_test, &lda, + data_sgemmt.b_test, &ldb, &beta, data_sgemmt.c_gemm, &ldc); + else + cblas_sgemm(order, transa, transb, m, m, k, alpha, data_sgemmt.a_test, lda, + data_sgemmt.b_test, ldb, beta, data_sgemmt.c_gemm, ldc); + + if (uplo == 'L' || uplo == CblasLower) + { + for (i = 0; i < m; i++) + for (j = i; j < m; j++) + data_sgemmt.c_verify[i * ldc + j] = + data_sgemmt.c_gemm[i * ldc + j]; + } else { + for (i = 0; i < m; i++) + for (j = 0; j <= i; j++) + data_sgemmt.c_verify[i * ldc + j] = + data_sgemmt.c_gemm[i * ldc + j]; + } +} + +/** + * Comapare results computed by sgemmt and sgemmt_trusted + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order (for Fortran API column major always) + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param alpha - scaling factor for the matrix-matrix product + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + * return norm of differences + */ +static float check_sgemmt(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, float alpha, blasint lda, + blasint ldb, float beta, blasint ldc) +{ + blasint i; + blasint b_cols; + blasint a_cols; + blasint inc = 1; + blasint size_c = m * ldc; + + if(order == CblasColMajor){ + if (transa == 'T' || transa == 'C' || + transa == CblasTrans || transa == CblasConjTrans) + a_cols = m; + else a_cols = k; + + if (transb == 'T' || transb == 'C' || + transb == CblasTrans || transb == CblasConjTrans) + b_cols = k; + else b_cols = m; + } else { + if (transa == 'T' || transa == 'C' || + transa == CblasTrans || transa == CblasConjTrans) + a_cols = k; + else a_cols = m; + + if (transb == 'T' || transb == 'C' || + transb == CblasTrans || transb == CblasConjTrans) + b_cols = m; + else b_cols = k; + } + + srand_generate(data_sgemmt.a_test, a_cols * lda); + srand_generate(data_sgemmt.b_test, b_cols * ldb); + srand_generate(data_sgemmt.c_test, m * ldc); + + for (i = 0; i < m * ldc; i++) + data_sgemmt.c_gemm[i] = data_sgemmt.c_verify[i] = data_sgemmt.c_test[i]; + + sgemmt_trusted(api, order, uplo, transa, transb, m, k, alpha, lda, ldb, beta, ldc); + + if (api == 'F') + BLASFUNC(sgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, data_sgemmt.a_test, + &lda, data_sgemmt.b_test, &ldb, &beta, data_sgemmt.c_test, &ldc); + else + cblas_sgemmt(order, uplo, transa, transb, m, k, alpha, data_sgemmt.a_test, lda, + data_sgemmt.b_test, ldb, beta, data_sgemmt.c_test, ldc); + + for (i = 0; i < m * ldc; i++) + data_sgemmt.c_verify[i] -= data_sgemmt.c_test[i]; + + return BLASFUNC(snrm2)(&size_c, data_sgemmt.c_verify, &inc) / size_c; +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param ldc - leading dimension of C + * param expected_info - expected invalid parameter number in sgemmt + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, blasint lda, blasint ldb, + blasint ldc, int expected_info) +{ + float alpha = 1.0f; + float beta = 0.0f; + + set_xerbla("SGEMMT ", expected_info); + + if (api == 'F') + BLASFUNC(sgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, data_sgemmt.a_test, + &lda, data_sgemmt.b_test, &ldb, &beta, data_sgemmt.c_test, &ldc); + else + cblas_sgemmt(order, uplo, transa, transb, m, k, alpha, data_sgemmt.a_test, lda, + data_sgemmt.b_test, ldb, beta, data_sgemmt.c_test, ldc); + + return check_error(); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(sgemmt, upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(sgemmt, upper_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 100; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + float alpha = 1.0f; + float beta = 2.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(sgemmt, upper_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'T'; + char uplo = 'U'; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(sgemmt, upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'T'; + char uplo = 'U'; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * alpha = 0.0 + */ +CTEST(sgemmt, upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + float alpha = 0.0f; + float beta = 2.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * beta = 1.0 + */ +CTEST(sgemmt, upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + float alpha = 2.0f; + float beta = 1.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(sgemmt, lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(sgemmt, lower_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 100; + char transa = 'T', transb = 'N'; + char uplo = 'L'; + float alpha = 1.0f; + float beta = 2.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(sgemmt, lower_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'T'; + char uplo = 'L'; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(sgemmt, lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'T'; + char uplo = 'L'; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * alpha = 0.0 + */ +CTEST(sgemmt, lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + float alpha = 0.0f; + float beta = 2.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * beta = 1.0 + */ +CTEST(sgemmt, lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + float alpha = 2.0f; + float beta = 1.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(sgemmt, c_api_colmajor_upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(sgemmt, c_api_colmajor_upper_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 100; + float alpha = 1.0f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(sgemmt, c_api_colmajor_upper_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(sgemmt, c_api_colmajor_upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * alpha = 0.0 + */ +CTEST(sgemmt, c_api_colmajor_upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 0.0f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * beta = 1.0 + */ +CTEST(sgemmt, c_api_colmajor_upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 2.0f; + float beta = 1.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(sgemmt, c_api_colmajor_lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(sgemmt, c_api_colmajor_lower_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 100; + float alpha = 1.0f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(sgemmt, c_api_colmajor_lower_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(sgemmt, c_api_colmajor_lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * alpha = 0.0 + */ +CTEST(sgemmt, c_api_colmajor_lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 0.0f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * beta = 1.0 + */ +CTEST(sgemmt, c_api_colmajor_lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 2.0f; + float beta = 1.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(sgemmt, c_api_rowmajor_upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(sgemmt, c_api_rowmajor_upper_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 100, ldb = 100, ldc = 100; + float alpha = 1.0f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(sgemmt, c_api_rowmajor_upper_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 100, ldb = 100, ldc = 50; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(sgemmt, c_api_rowmajor_upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * alpha = 0.0 + */ +CTEST(sgemmt, c_api_rowmajor_upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 0.0f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * beta = 1.0 + */ +CTEST(sgemmt, c_api_rowmajor_upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 2.0f; + float beta = 1.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(sgemmt, c_api_rowmajor_lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(sgemmt, c_api_rowmajor_lower_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 100, ldb = 100, ldc = 100; + float alpha = 1.0f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(sgemmt, c_api_rowmajor_lower_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 100, ldb = 100, ldc = 50; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(sgemmt, c_api_rowmajor_lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * alpha = 0.0 + */ +CTEST(sgemmt, c_api_rowmajor_lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 0.0f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * beta = 1.0 + */ +CTEST(sgemmt, c_api_rowmajor_lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 2.0f; + float beta = 1.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test error function for an invalid param uplo. + * Must be upper (U) or lower (L). + */ +CTEST(sgemmt, xerbla_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'O'; + int expected_info = 1; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param transa. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(sgemmt, xerbla_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'O', transb = 'N'; + char uplo = 'U'; + int expected_info = 2; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param transb. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(sgemmt, xerbla_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'O'; + char uplo = 'U'; + int expected_info = 3; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(sgemmt, xerbla_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 4; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(sgemmt, xerbla_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 5; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(sgemmt, xerbla_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 100, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + int expected_info = 8; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B not transposed. + */ +CTEST(sgemmt, xerbla_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 10; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(sgemmt, xerbla_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + int expected_info = 13; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. + * Test error function for an invalid param order. + * Must be column or row major. + */ +CTEST(sgemmt, xerbla_c_api_major_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 0; + + int passed = check_badargs('C', 'O', CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param uplo. + * Must be upper or lower. + */ +CTEST(sgemmt, xerbla_c_api_colmajor_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 1; + + int passed = check_badargs('C', CblasColMajor, 'O', CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param transa. + * Must be trans or no-trans. + */ +CTEST(sgemmt, xerbla_c_api_colmajor_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 2; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, 'O', CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param transb. + * Must be trans or no-trans. + */ +CTEST(sgemmt, xerbla_c_api_colmajor_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 3; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, 'O', + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(sgemmt, xerbla_c_api_colmajor_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 4; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(sgemmt, xerbla_c_api_colmajor_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 5; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(sgemmt, xerbla_c_api_colmajor_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 100, ldc = 50; + int expected_info = 8; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B not transposed. + */ +CTEST(sgemmt, xerbla_c_api_colmajor_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 10; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(sgemmt, xerbla_c_api_colmajor_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 13; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param uplo. + * Must be upper or lower. + */ +CTEST(sgemmt, xerbla_c_api_rowmajor_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 1; + + int passed = check_badargs('C', CblasRowMajor, 'O', CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param transa. + * Must be trans or no-trans. + */ +CTEST(sgemmt, xerbla_c_api_rowmajor_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 2; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, 'O', CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param transb. + * Must be trans or no-trans. + */ +CTEST(sgemmt, xerbla_c_api_rowmajor_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 3; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, 'O', + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(sgemmt, xerbla_c_api_rowmajor_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 4; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(sgemmt, xerbla_c_api_rowmajor_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 5; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(sgemmt, xerbla_c_api_rowmajor_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 8; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B transposed. + */ +CTEST(sgemmt, xerbla_c_api_rowmajor_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 10; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(sgemmt, xerbla_c_api_rowmajor_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 100, ldb = 100, ldc = 50; + int expected_info = 13; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_simatcopy.c b/utest/test_extensions/test_simatcopy.c new file mode 100644 index 000000000..ba388596d --- /dev/null +++ b/utest/test_extensions/test_simatcopy.c @@ -0,0 +1,915 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_SIMATCOPY { + float a_test[DATASIZE* DATASIZE]; + float a_verify[DATASIZE* DATASIZE]; +}; + +#ifdef BUILD_SINGLE +static struct DATA_SIMATCOPY data_simatcopy; + +/** + * Comapare results computed by simatcopy and reference func + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows specifies number of rows of A + * param cols specifies number of columns of A + * param alpha specifies scaling factor for matrix A + * param lda_src - leading dimension of the matrix A + * param lda_dst - leading dimension of output matrix A + * return norm of difference between openblas and reference func + */ +static float check_simatcopy(char api, char order, char trans, blasint rows, blasint cols, float alpha, + blasint lda_src, blasint lda_dst) +{ + blasint m, n; + blasint rows_out, cols_out; + enum CBLAS_ORDER corder; + enum CBLAS_TRANSPOSE ctrans; + + if (order == 'C') { + n = rows; m = cols; + } + else { + m = rows; n = cols; + } + + if(trans == 'T' || trans == 'C') { + rows_out = n; cols_out = m; + } + else { + rows_out = m; cols_out = n; + } + + srand_generate(data_simatcopy.a_test, lda_src*m); + + if (trans == 'T' || trans == 'C') { + stranspose(m, n, alpha, data_simatcopy.a_test, lda_src, data_simatcopy.a_verify, lda_dst); + } + else { + scopy(m, n, alpha, data_simatcopy.a_test, lda_src, data_simatcopy.a_verify, lda_dst); + } + + if (api == 'F') { + BLASFUNC(simatcopy)(&order, &trans, &rows, &cols, &alpha, data_simatcopy.a_test, + &lda_src, &lda_dst); + } + else { + if (order == 'C') corder = CblasColMajor; + if (order == 'R') corder = CblasRowMajor; + if (trans == 'T') ctrans = CblasTrans; + if (trans == 'N') ctrans = CblasNoTrans; + if (trans == 'C') ctrans = CblasConjTrans; + if (trans == 'R') ctrans = CblasConjNoTrans; + cblas_simatcopy(corder, ctrans, rows, cols, alpha, data_simatcopy.a_test, + lda_src, lda_dst); + } + + // Find the differences between output matrix computed by simatcopy and reference func + return smatrix_difference(data_simatcopy.a_test, data_simatcopy.a_verify, cols_out, rows_out, lda_dst); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows specifies number of rows of A + * param cols specifies number of columns of A + * param lda_src - leading dimension of the matrix A + * param lda_dst - leading dimension of output matrix A + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char order, char trans, blasint rows, blasint cols, + blasint lda_src, blasint lda_dst, int expected_info) +{ + float alpha = 1.0f; + + set_xerbla("SIMATCOPY", expected_info); + + BLASFUNC(simatcopy)(&order, &trans, &rows, &cols, &alpha, data_simatcopy.a_test, + &lda_src, &lda_dst); + + return check_error(); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 1.0f + */ +CTEST(simatcopy, colmajor_trans_col_100_row_100_alpha_one) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + float alpha = 1.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 1.0f + */ +CTEST(simatcopy, colmajor_notrans_col_100_row_100_alpha_one) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + float alpha = 1.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 0.0f + */ +CTEST(simatcopy, colmajor_trans_col_100_row_100_alpha_zero) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + float alpha = 0.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 0.0f + */ +CTEST(simatcopy, colmajor_notrans_col_100_row_100_alpha_zero) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + float alpha = 0.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 2.0f + */ +CTEST(simatcopy, colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + float alpha = 2.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 2.0f + */ +CTEST(simatcopy, colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + float alpha = 2.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha = 1.0f + */ +CTEST(simatcopy, colmajor_trans_col_50_row_100_alpha_one) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + float alpha = 1.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha = 1.0f + */ +CTEST(simatcopy, colmajor_notrans_col_50_row_100_alpha_one) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + float alpha = 1.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha = 0.0f + */ +CTEST(simatcopy, colmajor_trans_col_50_row_100_alpha_zero) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + float alpha = 0.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha = 0.0f + */ +CTEST(simatcopy, colmajor_notrans_col_50_row_100_alpha_zero) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + float alpha = 0.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha = 2.0f + */ +CTEST(simatcopy, colmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + float alpha = 2.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha = 2.0f + */ +CTEST(simatcopy, colmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + float alpha = 2.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 1.0f + */ +CTEST(simatcopy, rowmajor_trans_col_100_row_100_alpha_one) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + float alpha = 1.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 1.0f + */ +CTEST(simatcopy, rowmajor_notrans_col_100_row_100_alpha_one) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + float alpha = 1.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 0.0f + */ +CTEST(simatcopy, rowmajor_trans_col_100_row_100_alpha_zero) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + float alpha = 0.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 0.0f + */ +CTEST(simatcopy, rowmajor_notrans_col_100_row_100_alpha_zero) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + float alpha = 0.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific tests + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 2.0f + */ +CTEST(simatcopy, rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + float alpha = 2.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 2.0f + */ +CTEST(simatcopy, rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + float alpha = 2.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Rectangular matrix + * alpha = 1.0f + */ +CTEST(simatcopy, rowmajor_trans_col_100_row_50_alpha_one) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'C'; // same as trans for real matrix + float alpha = 1.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha = 1.0f + */ +CTEST(simatcopy, rowmajor_notrans_col_100_row_50_alpha_one) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + float alpha = 1.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Rectangular matrix + * alpha = 0.0f + */ +CTEST(simatcopy, rowmajor_trans_col_100_row_50_alpha_zero) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'C'; // same as trans for real matrix + float alpha = 0.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha = 0.0f + */ +CTEST(simatcopy, rowmajor_notrans_col_100_row_50_alpha_zero) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + float alpha = 0.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Rectangular matrix + * alpha = 2.0f + */ +CTEST(simatcopy, rowmajor_trans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'C'; // same as trans for real matrix + float alpha = 2.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha = 2.0f + */ +CTEST(simatcopy, rowmajor_notrans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + float alpha = 2.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 2.0f + */ +CTEST(simatcopy, c_api_colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + float alpha = 2.0f; + + float norm = check_simatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 2.0f + */ +CTEST(simatcopy, c_api_colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + float alpha = 2.0f; + + float norm = check_simatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 2.0f + */ +CTEST(simatcopy, c_api_rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + float alpha = 2.0f; + + float norm = check_simatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 2.0f + */ +CTEST(simatcopy, c_api_rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + float alpha = 2.0f; + + float norm = check_simatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test error function for an invalid param order. + * Must be column (C) or row major (R). + */ +CTEST(simatcopy, xerbla_invalid_order) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'O'; + char trans = 'T'; + int expected_info = 1; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param trans. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(simatcopy, xerbla_invalid_trans) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'O'; + int expected_info = 2; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_src. + * If matrices are stored using row major layout, + * lda_src must be at least n. + */ +CTEST(simatcopy, xerbla_rowmajor_invalid_lda) +{ + blasint m = 50, n = 100; + blasint lda_src = 50, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_src. + * If matrices are stored using column major layout, + * lda_src must be at least m. + */ +CTEST(simatcopy, xerbla_colmajor_invalid_lda) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using row major layout and + * there is no transposition, lda_dst must be at least n. + */ +CTEST(simatcopy, xerbla_rowmajor_notrans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'N'; + int expected_info = 8; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using row major layout and + * there is transposition, lda_dst must be at least m. + */ +CTEST(simatcopy, xerbla_rowmajor_trans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'T'; + int expected_info = 8; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using column major layout and + * there is no transposition, lda_dst must be at least m. + */ +CTEST(simatcopy, xerbla_colmajor_notrans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'N'; + int expected_info = 8; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using column major layout and + * there is transposition, lda_dst must be at least n. + */ +CTEST(simatcopy, xerbla_colmajor_trans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + int expected_info = 8; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_somatcopy.c b/utest/test_extensions/test_somatcopy.c new file mode 100644 index 000000000..b53c7cae5 --- /dev/null +++ b/utest/test_extensions/test_somatcopy.c @@ -0,0 +1,640 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_SOMATCOPY { + float a_test[DATASIZE * DATASIZE]; + float b_test[DATASIZE * DATASIZE]; + float b_verify[DATASIZE * DATASIZE]; +}; + +#ifdef BUILD_SINGLE +static struct DATA_SOMATCOPY data_somatcopy; + +/** + * Comapare results computed by somatcopy and reference func + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows - number of rows of A + * param cols - number of columns of A + * param alpha - scaling factor for matrix B + * param lda - leading dimension of the matrix A + * param ldb - leading dimension of the matrix B + * return norm of difference between openblas and reference func + */ +static float check_somatcopy(char api, char order, char trans, blasint rows, blasint cols, float alpha, + blasint lda, blasint ldb) +{ + blasint b_rows, b_cols; + blasint m, n; + enum CBLAS_ORDER corder; + enum CBLAS_TRANSPOSE ctrans; + + if (order == 'C') { + m = cols; n = rows; + } + else { + m = rows; n = cols; + } + + if(trans == 'T' || trans == 'C') { + b_rows = n; b_cols = m; + } + else { + b_rows = m; b_cols = n; + } + + srand_generate(data_somatcopy.a_test, lda*m); + + if (trans == 'T' || trans == 'C') { + stranspose(m, n, alpha, data_somatcopy.a_test, lda, data_somatcopy.b_verify, ldb); + } + else { + scopy(m, n, alpha, data_somatcopy.a_test, lda, data_somatcopy.b_verify, ldb); + } + + if (api == 'F') { + BLASFUNC(somatcopy)(&order, &trans, &rows, &cols, &alpha, data_somatcopy.a_test, + &lda, data_somatcopy.b_test, &ldb); + } + else { + if (order == 'C') corder = CblasColMajor; + if (order == 'R') corder = CblasRowMajor; + if (trans == 'T') ctrans = CblasTrans; + if (trans == 'N') ctrans = CblasNoTrans; + if (trans == 'C') ctrans = CblasConjTrans; + if (trans == 'R') ctrans = CblasConjNoTrans; + cblas_somatcopy(corder, ctrans, rows, cols, alpha, data_somatcopy.a_test, + lda, data_somatcopy.b_test, ldb); + } + + return smatrix_difference(data_somatcopy.b_test, data_somatcopy.b_verify, b_cols, b_rows, ldb); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows - number of rows of A + * param cols - number of columns of A + * param lda - leading dimension of the matrix A + * param ldb - leading dimension of the matrix B + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char order, char trans, blasint rows, blasint cols, + blasint lda, blasint ldb, int expected_info) +{ + float alpha = 1.0; + + set_xerbla("SOMATCOPY", expected_info); + + BLASFUNC(somatcopy)(&order, &trans, &rows, &cols, &alpha, data_somatcopy.a_test, + &lda, data_somatcopy.b_test, &ldb); + + return check_error(); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 1.0 + */ +CTEST(somatcopy, colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'T'; + float alpha = 1.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 1.0 + */ +CTEST(somatcopy, colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'N'; + float alpha = 1.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(somatcopy, colmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'T'; + float alpha = 2.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(somatcopy, colmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'N'; + float alpha = 2.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha = 0.0 + */ +CTEST(somatcopy, colmajor_trans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 50, ldb = 100; + char order = 'C'; + char trans = 'T'; + float alpha = 0.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha = 0.0 + */ +CTEST(somatcopy, colmajor_notrans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 50, ldb = 50; + char order = 'C'; + char trans = 'N'; + float alpha = 0.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 1.0 + */ +CTEST(somatcopy, rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'T'; + float alpha = 1.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 1.0 + */ +CTEST(somatcopy, rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'N'; + float alpha = 1.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(somatcopy, rowmajor_conjtrans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'C'; // same as trans for real matrix + float alpha = 2.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(somatcopy, rowmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 50, ldb = 50; + char order = 'R'; + char trans = 'N'; + float alpha = 2.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * Matrix dimensions leave residues from 4 and 2 (specialize + * for rt case) + * alpha = 1.5 + */ +CTEST(somatcopy, rowmajor_trans_col_27_row_27) +{ + blasint m = 27, n = 27; + blasint lda = 27, ldb = 27; + char order = 'R'; + char trans = 'T'; + float alpha = 1.5f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha = 0.0 + */ +CTEST(somatcopy, rowmajor_notrans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'N'; + float alpha = 0.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 1.0 + */ +CTEST(somatcopy, c_api_colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'T'; + float alpha = 1.0f; + + float norm = check_somatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 1.0 + */ +CTEST(somatcopy, c_api_colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'N'; + float alpha = 1.0f; + + float norm = check_somatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 1.0 + */ +CTEST(somatcopy, c_api_rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'T'; + float alpha = 1.0f; + + float norm = check_somatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 1.0 + */ +CTEST(somatcopy, c_api_rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'N'; + float alpha = 1.0f; + + float norm = check_somatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test error function for an invalid param order. + * Must be column (C) or row major (R). + */ +CTEST(somatcopy, xerbla_invalid_order) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'O'; + char trans = 'T'; + int expected_info = 1; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param trans. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(somatcopy, xerbla_invalid_trans) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'O'; + int expected_info = 2; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda. + * If matrices are stored using row major layout, + * lda must be at least n. + */ +CTEST(somatcopy, xerbla_rowmajor_invalid_lda) +{ + blasint m = 50, n = 100; + blasint lda = 50, ldb = 100; + char order = 'R'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda. + * If matrices are stored using column major layout, + * lda must be at least m. + */ +CTEST(somatcopy, xerbla_colmajor_invalid_lda) +{ + blasint m = 100, n = 50; + blasint lda = 50, ldb = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using row major layout and + * there is no transposition, ldb must be at least n. + */ +CTEST(somatcopy, xerbla_rowmajor_notrans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using row major layout and + * there is transposition, ldb must be at least m. + */ +CTEST(somatcopy, xerbla_rowmajor_trans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using column major layout and + * there is no transposition, ldb must be at least m. + */ +CTEST(somatcopy, xerbla_colmajor_notrans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using column major layout and + * there is transposition, ldb must be at least n. + */ +CTEST(somatcopy, xerbla_colmajor_trans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif diff --git a/utest/test_extensions/test_srotmg.c b/utest/test_extensions/test_srotmg.c new file mode 100644 index 000000000..3c97e3b4d --- /dev/null +++ b/utest/test_extensions/test_srotmg.c @@ -0,0 +1,414 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#ifdef BUILD_SINGLE + +/** + * Fortran API specific test + * Test srotmg by comparing it against pre-calculated values + */ +CTEST(srotmg, y1_zero) +{ + float te_d1, tr_d1; + float te_d2, tr_d2; + float te_x1, tr_x1; + float te_y1, tr_y1; + float te_param[5]; + float tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 2.0f; + te_d2 = tr_d2 = 2.0f; + te_x1 = tr_x1 = 8.0f; + te_y1 = tr_y1 = 0.0f; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0f; + } + + //reference values as calculated by netlib blas + tr_d1 = 2.0f; + tr_d2 = 2.0f; + tr_x1 = 8.0f; + tr_y1 = 0.0f; + + tr_param[0] = -2.0f; + tr_param[1] = 0.0f; + tr_param[2] = 0.0f; + tr_param[3] = 0.0f; + tr_param[4] = 0.0f; + + //OpenBLAS + BLASFUNC(srotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS); + } +} + +/** + * Fortran API specific test + * Test srotmg by comparing it against pre-calculated values + */ +CTEST(srotmg, d1_negative) +{ + float te_d1, tr_d1; + float te_d2, tr_d2; + float te_x1, tr_x1; + float te_y1, tr_y1; + float te_param[5]; + float tr_param[5]; + int i = 0; + te_d1 = tr_d1 = -1.0f; + te_d2 = tr_d2 = 2.0f; + te_x1 = tr_x1 = 8.0f; + te_y1 = tr_y1 = 8.0f; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0f; + } + + //reference values as calculated by netlib blas + tr_d1 = 0.0f; + tr_d2 = 0.0f; + tr_x1 = 0.0f; + tr_y1 = 8.0f; + + tr_param[0] = -1.0f; + tr_param[1] = 0.0f; + tr_param[2] = 0.0f; + tr_param[3] = 0.0f; + tr_param[4] = 0.0f; + + //OpenBLAS + BLASFUNC(srotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS); + } +} + +/** + * Fortran API specific test + * Test srotmg by comparing it against pre-calculated values + */ +CTEST(srotmg, d1_positive_d2_positive_x1_zero) +{ + float te_d1, tr_d1; + float te_d2, tr_d2; + float te_x1, tr_x1; + float te_y1, tr_y1; + float te_param[5]; + float tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 2.0f; + te_d2 = tr_d2 = 2.0f; + te_x1 = tr_x1 = 0.0f; + te_y1 = tr_y1 = 8.0f; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0f; + } + + //reference values as calculated by netlib blas + tr_d1 = 2.0f; + tr_d2 = 2.0f; + tr_x1 = 8.0f; + tr_y1 = 8.0f; + + tr_param[0] = 1.0f; + tr_param[1] = 0.0f; + tr_param[2] = 0.0f; + tr_param[3] = 0.0f; + tr_param[4] = 0.0f; + + //OpenBLAS + BLASFUNC(srotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS); + } +} + +/** + * Fortran API specific test + * Test srotmg by comparing it against pre-calculated values + */ +CTEST(srotmg, scaled_y_greater_than_scaled_x) +{ + float te_d1, tr_d1; + float te_d2, tr_d2; + float te_x1, tr_x1; + float te_y1, tr_y1; + float te_param[5]; + float tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 1.0f; + te_d2 = tr_d2 = -2.0f; + te_x1 = tr_x1 = 8.0f; + te_y1 = tr_y1 = 8.0f; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0f; + } + + //reference values as calculated by netlib blas + tr_d1 = 0.0f; + tr_d2 = 0.0f; + tr_x1 = 0.0f; + tr_y1 = 8.0f; + + tr_param[0] = -1.0f; + tr_param[1] = 0.0f; + tr_param[2] = 0.0f; + tr_param[3] = 0.0f; + tr_param[4] = 0.0f; + + //OpenBLAS + BLASFUNC(srotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS); + } +} + +/** + * C API specific test + * Test srotmg by comparing it against pre-calculated values + */ +CTEST(srotmg, c_api_y1_zero) +{ + float te_d1, tr_d1; + float te_d2, tr_d2; + float te_x1, tr_x1; + float te_y1, tr_y1; + float te_param[5]; + float tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 2.0f; + te_d2 = tr_d2 = 2.0f; + te_x1 = tr_x1 = 8.0f; + te_y1 = tr_y1 = 0.0f; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0f; + } + + //reference values as calculated by netlib blas + tr_d1 = 2.0f; + tr_d2 = 2.0f; + tr_x1 = 8.0f; + tr_y1 = 0.0f; + + tr_param[0] = -2.0f; + tr_param[1] = 0.0f; + tr_param[2] = 0.0f; + tr_param[3] = 0.0f; + tr_param[4] = 0.0f; + + //OpenBLAS + cblas_srotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS); + } +} + +/** + * C API specific test + * Test srotmg by comparing it against pre-calculated values + */ +CTEST(srotmg, c_api_d1_negative) +{ + float te_d1, tr_d1; + float te_d2, tr_d2; + float te_x1, tr_x1; + float te_y1, tr_y1; + float te_param[5]; + float tr_param[5]; + int i = 0; + te_d1 = tr_d1 = -1.0f; + te_d2 = tr_d2 = 2.0f; + te_x1 = tr_x1 = 8.0f; + te_y1 = tr_y1 = 8.0f; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0f; + } + + //reference values as calculated by netlib blas + tr_d1 = 0.0f; + tr_d2 = 0.0f; + tr_x1 = 0.0f; + tr_y1 = 8.0f; + + tr_param[0] = -1.0f; + tr_param[1] = 0.0f; + tr_param[2] = 0.0f; + tr_param[3] = 0.0f; + tr_param[4] = 0.0f; + + //OpenBLAS + cblas_srotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS); + } +} + +/** + * C API specific test + * Test srotmg by comparing it against pre-calculated values + */ +CTEST(srotmg, c_api_d1_positive_d2_positive_x1_zero) +{ + float te_d1, tr_d1; + float te_d2, tr_d2; + float te_x1, tr_x1; + float te_y1, tr_y1; + float te_param[5]; + float tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 2.0f; + te_d2 = tr_d2 = 2.0f; + te_x1 = tr_x1 = 0.0f; + te_y1 = tr_y1 = 8.0f; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0f; + } + + //reference values as calculated by netlib blas + tr_d1 = 2.0f; + tr_d2 = 2.0f; + tr_x1 = 8.0f; + tr_y1 = 8.0f; + + tr_param[0] = 1.0f; + tr_param[1] = 0.0f; + tr_param[2] = 0.0f; + tr_param[3] = 0.0f; + tr_param[4] = 0.0f; + + //OpenBLAS + cblas_srotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS); + } +} + +/** + * C API specific test + * Test srotmg by comparing it against pre-calculated values + */ +CTEST(srotmg, c_api_scaled_y_greater_than_scaled_x) +{ + float te_d1, tr_d1; + float te_d2, tr_d2; + float te_x1, tr_x1; + float te_y1, tr_y1; + float te_param[5]; + float tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 1.0f; + te_d2 = tr_d2 = -2.0f; + te_x1 = tr_x1 = 8.0f; + te_y1 = tr_y1 = 8.0f; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0f; + } + + //reference values as calculated by netlib blas + tr_d1 = 0.0f; + tr_d2 = 0.0f; + tr_x1 = 0.0f; + tr_y1 = 8.0f; + + tr_param[0] = -1.0f; + tr_param[1] = 0.0f; + tr_param[2] = 0.0f; + tr_param[3] = 0.0f; + tr_param[4] = 0.0f; + + //OpenBLAS + cblas_srotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS); + } +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_ssum.c b/utest/test_extensions/test_ssum.c new file mode 100644 index 000000000..971a0d2e0 --- /dev/null +++ b/utest/test_extensions/test_ssum.c @@ -0,0 +1,403 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 50 +#define INCREMENT 2 + +#ifdef BUILD_SINGLE + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < ELEMENTS * inc; i ++) { + x[i] = 1000 - i; + } + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0f; + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.1f}; + + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.1f, 0.0f}; + + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.1f, -1.0f}; + + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.1f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.1f, -1.5f, 1.0f, 1.0f}; + + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.1f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f}; + + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.3f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.1f, 0.0f, -1.0f, -3.0f, 2.2f, 3.0f}; + + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.3f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.1f, 1.0f, -2.2f, 3.3f}; + + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.2f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 2.0f, 2.2f, 2.7f, -3.3f, -5.9f}; + + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_1_N_5){ + blasint N = 5, inc = 1; + float x[] = {0.0f, 1.0f, 2.2f, 3.3f, 0.0f}; + + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(6.5f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_2_N_5){ + blasint N = 5, inc = 2; + float x[] = {0.0f, 3.0f, 1.0f, -2.2f, 2.2f, -1.7f, 3.3f, 14.5f, 0.0f, -9.0f}; + + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(6.5f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = (i & 1) ? -1.0f : 1.0f; + } + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = (i & 1) ? -1.0f : 1.0f; + } + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(50.0f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < ELEMENTS * inc; i ++) { + x[i] = 1000 - i; + } + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0f; + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.1f}; + + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.1f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.1f, 0.0f}; + + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.1f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.1f, -1.0f}; + + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.1f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.1f, -1.5f, 1.0f, 1.0f}; + + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(2.1f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f}; + + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(4.3f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.1f, 0.0f, -1.0f, -3.0f, 2.2f, 3.0f}; + + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(2.3f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.1f, 1.0f, -2.2f, 3.3f}; + + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(3.2f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 2.0f, 2.2f, 2.7f, -3.3f, -5.9f}; + + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.0f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_1_N_5){ + blasint N = 5, inc = 1; + float x[] = {0.0f, 1.0f, 2.2f, 3.3f, 0.0f}; + + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(6.5f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_2_N_5){ + blasint N = 5, inc = 2; + float x[] = {0.0f, 3.0f, 1.0f, -2.2f, 2.2f, -1.7f, 3.3f, 14.5f, 0.0f, -9.0f}; + + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(6.5f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = (i & 1) ? -1.0f : 1.0f; + } + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = (i & 1) ? -1.0f : 1.0f; + } + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(50.0f, sum, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_zaxpby.c b/utest/test_extensions/test_zaxpby.c new file mode 100644 index 000000000..6148f44c5 --- /dev/null +++ b/utest/test_extensions/test_zaxpby.c @@ -0,0 +1,630 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_ZAXPBY { + double x_test[DATASIZE * INCREMENT * 2]; + double x_verify[DATASIZE * INCREMENT * 2]; + double y_test[DATASIZE * INCREMENT * 2]; + double y_verify[DATASIZE * INCREMENT * 2]; +}; +#ifdef BUILD_COMPLEX16 +static struct DATA_ZAXPBY data_zaxpby; + +/** + * Fortran API specific function + * Test zaxpby by comparing it with zscal and zaxpy. + * Compare with the following options: + * + * param n - number of elements in vectors x and y + * param alpha - scalar alpha + * param incx - increment for the elements of x + * param beta - scalar beta + * param incy - increment for the elements of y + * return norm of difference + */ +static double check_zaxpby(blasint n, double *alpha, blasint incx, double *beta, blasint incy) +{ + blasint i; + + // zscal accept only positive increments + blasint incx_abs = labs(incx); + blasint incy_abs = labs(incy); + + // Fill vectors x, y + drand_generate(data_zaxpby.x_test, n * incx_abs * 2); + drand_generate(data_zaxpby.y_test, n * incy_abs * 2); + + // Copy vector x for zaxpy + for (i = 0; i < n * incx_abs * 2; i++) + data_zaxpby.x_verify[i] = data_zaxpby.x_test[i]; + + // Copy vector y for zscal + for (i = 0; i < n * incy_abs * 2; i++) + data_zaxpby.y_verify[i] = data_zaxpby.y_test[i]; + + // Find beta*y + BLASFUNC(zscal)(&n, beta, data_zaxpby.y_verify, &incy_abs); + + // Find sum of alpha*x and beta*y + BLASFUNC(zaxpy)(&n, alpha, data_zaxpby.x_verify, &incx, + data_zaxpby.y_verify, &incy); + + BLASFUNC(zaxpby)(&n, alpha, data_zaxpby.x_test, &incx, + beta, data_zaxpby.y_test, &incy); + + // Find the differences between output vector caculated by zaxpby and zaxpy + for (i = 0; i < n * incy_abs * 2; i++) + data_zaxpby.y_test[i] -= data_zaxpby.y_verify[i]; + + // Find the norm of differences + return BLASFUNC(dznrm2)(&n, data_zaxpby.y_test, &incy_abs); +} + +/** + * C API specific function + * Test zaxpby by comparing it with zscal and zaxpy. + * Compare with the following options: + * + * param n - number of elements in vectors x and y + * param alpha - scalar alpha + * param incx - increment for the elements of x + * param beta - scalar beta + * param incy - increment for the elements of y + * return norm of difference + */ +static double c_api_check_zaxpby(blasint n, double *alpha, blasint incx, double *beta, blasint incy) +{ + blasint i; + + // zscal accept only positive increments + blasint incx_abs = labs(incx); + blasint incy_abs = labs(incy); + + // Fill vectors x, y + drand_generate(data_zaxpby.x_test, n * incx_abs * 2); + drand_generate(data_zaxpby.y_test, n * incy_abs * 2); + + // Copy vector x for zaxpy + for (i = 0; i < n * incx_abs * 2; i++) + data_zaxpby.x_verify[i] = data_zaxpby.x_test[i]; + + // Copy vector y for zscal + for (i = 0; i < n * incy_abs * 2; i++) + data_zaxpby.y_verify[i] = data_zaxpby.y_test[i]; + + // Find beta*y + cblas_zscal(n, beta, data_zaxpby.y_verify, incy_abs); + + // Find sum of alpha*x and beta*y + cblas_zaxpy(n, alpha, data_zaxpby.x_verify, incx, + data_zaxpby.y_verify, incy); + + cblas_zaxpby(n, alpha, data_zaxpby.x_test, incx, + beta, data_zaxpby.y_test, incy); + + // Find the differences between output vector caculated by zaxpby and zaxpy + for (i = 0; i < n * incy_abs * 2; i++) + data_zaxpby.y_test[i] -= data_zaxpby.y_verify[i]; + + // Find the norm of differences + return cblas_dznrm2(n, data_zaxpby.y_test, incy_abs); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + */ +CTEST(zaxpby, inc_x_1_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + */ +CTEST(zaxpby, inc_x_2_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + */ +CTEST(zaxpby, inc_x_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha[] = {1.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + */ +CTEST(zaxpby, inc_x_2_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + double alpha[] = {3.0, 1.0}; + double beta[] = {4.0, 3.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is 2 + */ +CTEST(zaxpby, inc_x_neg_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = -1, incy = 2; + double alpha[] = {5.0, 2.2}; + double beta[] = {4.0, 5.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is -1 + */ +CTEST(zaxpby, inc_x_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = -1; + double alpha[] = {1.0, 1.0}; + double beta[] = {6.0, 3.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is -1 + */ +CTEST(zaxpby, inc_x_neg_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = -2, incy = -1; + double alpha[] = {7.0, 2.0}; + double beta[] = {3.5, 1.3}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + */ +CTEST(zaxpby, inc_x_1_inc_y_1_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha[] = {0.0, 0.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar beta is zero + */ +CTEST(zaxpby, inc_x_1_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + * Scalar beta is zero + */ +CTEST(zaxpby, inc_x_1_inc_y_1_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha[] = {0.0, 0.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero + * Scalar beta is zero +*/ +CTEST(zaxpby, inc_x_1_inc_y_2_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha[] = {0.0, 0.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Check if n - size of vectors x, y is zero + */ +CTEST(zaxpby, check_n_zero) +{ + blasint n = 0, incx = 1, incy = 1; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + */ +CTEST(zaxpby, c_api_inc_x_1_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + */ +CTEST(zaxpby, c_api_inc_x_2_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + */ +CTEST(zaxpby, c_api_inc_x_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha[] = {1.0, 1.0}; + double beta[] = {2.0, 2.1}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + */ +CTEST(zaxpby, c_api_inc_x_2_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + double alpha[] = {3.0, 2.0}; + double beta[] = {4.0, 3.0}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is 2 + */ +CTEST(zaxpby, c_api_inc_x_neg_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = -1, incy = 2; + double alpha[] = {5.0, 2.0}; + double beta[] = {4.0, 3.1}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is -1 + */ +CTEST(zaxpby, c_api_inc_x_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = -1; + double alpha[] = {1.0, 1.0}; + double beta[] = {6.0, 2.3}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is -1 + */ +CTEST(zaxpby, c_api_inc_x_neg_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = -2, incy = -1; + double alpha[] = {7.0, 1.0}; + double beta[] = {3.5, 1.0}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + */ +CTEST(zaxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha[] = {0.0, 0.0}; + double beta[] = {1.0, 1.0}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar beta is zero + */ +CTEST(zaxpby, c_api_inc_x_1_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + * Scalar beta is zero + */ +CTEST(zaxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha[] = {0.0, 0.0}; + double beta[] = {0.0, 0.0}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero + * Scalar beta is zero +*/ +CTEST(zaxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha[] = {0.0, 0.0}; + double beta[] = {0.0, 0.0}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Check if n - size of vectors x, y is zero + */ +CTEST(zaxpby, c_api_check_n_zero) +{ + blasint n = 0, incx = 1, incy = 1; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} +#endif diff --git a/utest/test_extensions/test_zaxpyc.c b/utest/test_extensions/test_zaxpyc.c new file mode 100644 index 000000000..7c11cd920 --- /dev/null +++ b/utest/test_extensions/test_zaxpyc.c @@ -0,0 +1,159 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_ZAXPYC { + double x_test[DATASIZE * INCREMENT * 2]; + double x_verify[DATASIZE * INCREMENT * 2]; + double y_test[DATASIZE * INCREMENT * 2]; + double y_verify[DATASIZE * INCREMENT * 2]; +}; +#ifdef BUILD_COMPLEX16 +static struct DATA_ZAXPYC data_zaxpyc; + +/** + * Test zaxpyc by conjugating vector x and comparing with zaxpy. + * Compare with the following options: + * + * param n - number of elements in vectors x and y + * param alpha - scalar alpha + * param incx - increment for the elements of x + * param incy - increment for the elements of y + * return norm of difference + */ +static double check_zaxpyc(blasint n, double *alpha, blasint incx, blasint incy) +{ + blasint i; + + drand_generate(data_zaxpyc.x_test, n * incx * 2); + drand_generate(data_zaxpyc.y_test, n * incy * 2); + + for (i = 0; i < n * incx * 2; i++) + data_zaxpyc.x_verify[i] = data_zaxpyc.x_test[i]; + + for (i = 0; i < n * incy * 2; i++) + data_zaxpyc.y_verify[i] = data_zaxpyc.y_test[i]; + + zconjugate_vector(n, incx, data_zaxpyc.x_verify); + + BLASFUNC(zaxpy) + (&n, alpha, data_zaxpyc.x_verify, &incx, + data_zaxpyc.y_verify, &incy); + + BLASFUNC(zaxpyc) + (&n, alpha, data_zaxpyc.x_test, &incx, + data_zaxpyc.y_test, &incy); + + for (i = 0; i < n * incy * 2; i++) + data_zaxpyc.y_verify[i] -= data_zaxpyc.y_test[i]; + + return BLASFUNC(dznrm2)(&n, data_zaxpyc.y_verify, &incy); +} + +/** + * Test zaxpyc by conjugating vector x and comparing with zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + */ +CTEST(zaxpyc, conj_strides_one) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha[] = {5.0, 2.2}; + + double norm = check_zaxpyc(n, alpha, incx, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test zaxpyc by conjugating vector x and comparing with zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + */ +CTEST(zaxpyc, conj_incx_one) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha[] = {5.0, 2.2}; + + double norm = check_zaxpyc(n, alpha, incx, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test zaxpyc by conjugating vector x and comparing with zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + */ +CTEST(zaxpyc, conj_incy_one) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + double alpha[] = {5.0, 2.2}; + + double norm = check_zaxpyc(n, alpha, incx, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test zaxpyc by conjugating vector x and comparing with zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + */ +CTEST(zaxpyc, conj_strides_two) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + double alpha[] = {5.0, 2.2}; + + double norm = check_zaxpyc(n, alpha, incx, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} +#endif diff --git a/utest/test_extensions/test_zgbmv.c b/utest/test_extensions/test_zgbmv.c new file mode 100644 index 000000000..55473361c --- /dev/null +++ b/utest/test_extensions/test_zgbmv.c @@ -0,0 +1,280 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 1 + +struct DATA_ZGBMV { + double a_test[DATASIZE * DATASIZE * 2]; + double a_band_storage[DATASIZE * DATASIZE * 2]; + double matrix[DATASIZE * DATASIZE * 2]; + double b_test[DATASIZE * 2 * INCREMENT]; + double c_test[DATASIZE * 2 * INCREMENT]; + double c_verify[DATASIZE * 2 * INCREMENT]; +}; + +#ifdef BUILD_COMPLEX16 + +static struct DATA_ZGBMV data_zgbmv; + +/** + * Transform full-storage band matrix A to band-packed storage mode. + * + * param m - number of rows of A + * param n - number of columns of A + * param kl - number of sub-diagonals of the matrix A + * param ku - number of super-diagonals of the matrix A + * output param a - buffer for holding band-packed matrix + * param lda - specifies the leading dimension of a + * param matrix - buffer holding full-storage band matrix A + * param ldm - specifies the leading full-storage band matrix A + */ +static void transform_to_band_storage(blasint m, blasint n, blasint kl, + blasint ku, double* a, blasint lda, + double* matrix, blasint ldm) +{ + blasint i, j, k; + for (j = 0; j < n; j++) + { + k = 2 * (ku - j); + for (i = MAX(0, 2*(j - ku)); i < MIN(m, j + kl + 1) * 2; i+=2) + { + a[(k + i) + j * lda * 2] = matrix[i + j * ldm * 2]; + a[(k + i) + j * lda * 2 + 1] = matrix[i + j * ldm * 2 + 1]; + } + } +} + +/** + * Generate full-storage band matrix A with kl sub-diagonals and ku super-diagonals + * + * param m - number of rows of A + * param n - number of columns of A + * param kl - number of sub-diagonals of the matrix A + * param ku - number of super-diagonals of the matrix A + * output param band_matrix - buffer for full-storage band matrix. + * param matrix - buffer holding input general matrix + * param ldm - specifies the leading of input general matrix + */ +static void get_band_matrix(blasint m, blasint n, blasint kl, blasint ku, + double *band_matrix, double *matrix, blasint ldm) +{ + blasint i, j; + blasint k = 0; + for (i = 0; i < n; i++) + { + for (j = 0; j < m * 2; j += 2) + { + if ((blasint)(j/2) > kl + i || i > ku + (blasint)(j/2)) + { + band_matrix[i * ldm * 2 + j] = 0.0; + band_matrix[i * ldm * 2 + j + 1] = 0.0; + continue; + } + + band_matrix[i * ldm * 2 + j] = matrix[k++]; + band_matrix[i * ldm * 2 + j + 1] = matrix[k++]; + } + } +} + +/** + * Comapare results computed by zgbmv and zgemv + * since gbmv is gemv for band matrix + * + * param trans specifies op(A), the transposition operation applied to A + * param m - number of rows of A + * param n - number of columns of A + * param kl - number of sub-diagonals of the matrix A + * param ku - number of super-diagonals of the matrix A + * param alpha - scaling factor for the matrix-vector product + * param lda - specifies the leading dimension of a + * param inc_b - stride of vector b + * param beta - scaling factor for vector c + * param inc_c - stride of vector c + * return norm of differences + */ +static double check_zgbmv(char trans, blasint m, blasint n, blasint kl, blasint ku, + double *alpha, blasint lda, blasint inc_b, double *beta, blasint inc_c) +{ + blasint i; + blasint lenb, lenc; + + if(trans == 'T' || trans == 'C' || trans == 'D' || trans == 'U'){ + lenb = m; + lenc = n; + } else { + lenb = n; + lenc = m; + } + + drand_generate(data_zgbmv.matrix, m * n * 2); + drand_generate(data_zgbmv.b_test, 2 * (1 + (lenb - 1) * inc_b)); + drand_generate(data_zgbmv.c_test, 2 * (1 + (lenc - 1) * inc_c)); + + for (i = 0; i < 2 * (1 + (lenc - 1) * inc_c); i++) + data_zgbmv.c_verify[i] = data_zgbmv.c_test[i]; + + get_band_matrix(m, n, kl, ku, data_zgbmv.a_test, data_zgbmv.matrix, m); + + transform_to_band_storage(m, n, kl, ku, data_zgbmv.a_band_storage, lda, data_zgbmv.a_test, m); + + BLASFUNC(zgemv)(&trans, &m, &n, alpha, data_zgbmv.a_test, &m, data_zgbmv.b_test, + &inc_b, beta, data_zgbmv.c_verify, &inc_c); + + BLASFUNC(zgbmv)(&trans, &m, &n, &kl, &ku, alpha, data_zgbmv.a_band_storage, &lda, data_zgbmv.b_test, + &inc_b, beta, data_zgbmv.c_test, &inc_c); + + for (i = 0; i < 2 * (1 + (lenc - 1) * inc_c); i++) + data_zgbmv.c_verify[i] -= data_zgbmv.c_test[i]; + + return BLASFUNC(dznrm2)(&lenc, data_zgbmv.c_verify, &inc_c); +} + +/** + * Test zgbmv by comparing it against zgemv + * with param trans is D + */ +CTEST(zgbmv, trans_D) +{ + blasint m = 50, n = 25; + blasint inc_b = 1, inc_c = 1; + blasint kl = 20, ku = 11; + blasint lda = 50; + char trans = 'D'; + + double alpha[] = {7.0, 1.0}; + double beta[] = {1.5, -1.5}; + + double norm = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgbmv by comparing it against zgemv + * with param trans is O + */ +CTEST(zgbmv, trans_O) +{ + blasint m = 50, n = 25; + blasint inc_b = 1, inc_c = 1; + blasint kl = 20, ku = 10; + blasint lda = 50; + char trans = 'O'; + + double alpha[] = {7.0, 1.0}; + double beta[] = {1.5, -1.5}; + + double norm = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgbmv by comparing it against zgemv + * with param trans is S + */ +CTEST(zgbmv, trans_S) +{ + blasint m = 50, n = 25; + blasint inc_b = 1, inc_c = 1; + blasint kl = 6, ku = 9; + blasint lda = 50; + char trans = 'S'; + + double alpha[] = {7.0, 1.0}; + double beta[] = {1.5, -1.5}; + + double norm = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgbmv by comparing it against zgemv + * with param trans is U + */ +CTEST(zgbmv, trans_U) +{ + blasint m = 25, n = 50; + blasint inc_b = 1, inc_c = 1; + blasint kl = 7, ku = 11; + blasint lda = kl + ku + 1; + char trans = 'U'; + + double alpha[] = {7.0, 1.0}; + double beta[] = {1.5, -1.5}; + + double norm = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgbmv by comparing it against zgemv + * with param trans is C + */ +CTEST(zgbmv, trans_C) +{ + blasint m = 50, n = 25; + blasint inc_b = 1, inc_c = 1; + blasint kl = 20, ku = 11; + blasint lda = 50; + char trans = 'C'; + + double alpha[] = {7.0, 1.0}; + double beta[] = {1.5, -1.5}; + + double norm = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgbmv by comparing it against zgemv + * with param trans is R + */ +CTEST(zgbmv, trans_R) +{ + blasint m = 50, n = 100; + blasint inc_b = 1, inc_c = 1; + blasint kl = 20, ku = 11; + blasint lda = 50; + char trans = 'R'; + + double alpha[] = {7.0, 1.0}; + double beta[] = {1.5, -1.5}; + + double norm = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} +#endif diff --git a/utest/test_extensions/test_zgeadd.c b/utest/test_extensions/test_zgeadd.c new file mode 100644 index 000000000..7496ccf88 --- /dev/null +++ b/utest/test_extensions/test_zgeadd.c @@ -0,0 +1,880 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define N 100 +#define M 100 + +struct DATA_ZGEADD { + double a_test[M * N * 2]; + double c_test[M * N * 2]; + double c_verify[M * N * 2]; +}; + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZGEADD data_zgeadd; + +/** + * zgeadd reference implementation + * + * param m - number of rows of A and C + * param n - number of columns of A and C + * param alpha - scaling factor for matrix A + * param aptr - refer to matrix A + * param lda - leading dimension of A + * param beta - scaling factor for matrix C + * param cptr - refer to matrix C + * param ldc - leading dimension of C + */ +static void zgeadd_trusted(blasint m, blasint n, double *alpha, double *aptr, + blasint lda, double *beta, double *cptr, blasint ldc) +{ + blasint i; + + lda *= 2; + ldc *= 2; + + for (i = 0; i < n; i++) + { + cblas_zaxpby(m, alpha, aptr, 1, beta, cptr, 1); + aptr += lda; + cptr += ldc; + } +} + +/** + * Test zgeadd by comparing it against reference + * Compare with the following options: + * + * param api - specifies Fortran or C API + * param order - specifies whether A and C stored in + * row-major order or column-major order + * param m - number of rows of A and C + * param n - number of columns of A and C + * param alpha - scaling factor for matrix A + * param lda - leading dimension of A + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + * return norm of differences + */ +static double check_zgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order, + blasint m, blasint n, double *alpha, blasint lda, + double *beta, blasint ldc) +{ + blasint i; + blasint cols = m, rows = n; + + if (order == CblasRowMajor) + { + rows = m; + cols = n; + } + + // Fill matrix A, C + drand_generate(data_zgeadd.a_test, lda * rows * 2); + drand_generate(data_zgeadd.c_test, ldc * rows * 2); + + // Copy matrix C for zgeadd + for (i = 0; i < ldc * rows * 2; i++) + data_zgeadd.c_verify[i] = data_zgeadd.c_test[i]; + + zgeadd_trusted(cols, rows, alpha, data_zgeadd.a_test, lda, + beta, data_zgeadd.c_verify, ldc); + + if (api == 'F') + BLASFUNC(zgeadd)(&m, &n, alpha, data_zgeadd.a_test, &lda, + beta, data_zgeadd.c_test, &ldc); + else + cblas_zgeadd(order, m, n, alpha, data_zgeadd.a_test, lda, + beta, data_zgeadd.c_test, ldc); + + // Find the differences between output matrix caculated by zgeadd and sgemm + return dmatrix_difference(data_zgeadd.c_test, data_zgeadd.c_verify, cols, rows, ldc * 2); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param api - specifies Fortran or C API + * param order - specifies whether A and C stored in + * row-major order or column-major order + * param m - number of rows of A and C + * param n - number of columns of A and C + * param lda - leading dimension of A + * param ldc - leading dimension of C + * param expected_info - expected invalid parameter number in zgeadd + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order, + blasint m, blasint n, blasint lda, + blasint ldc, int expected_info) +{ + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + set_xerbla("ZGEADD ", expected_info); + + if (api == 'F') + BLASFUNC(zgeadd)(&m, &n, alpha, data_zgeadd.a_test, &lda, + beta, data_zgeadd.c_test, &ldc); + else + cblas_zgeadd(order, m, n, alpha, data_zgeadd.a_test, lda, + beta, data_zgeadd.c_test, ldc); + + return check_error(); +} + +/** + * Fortran API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(zgeadd, matrix_n_100_m_100) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {3.0, 2.0}; + double beta[] = {1.0, 3.0}; + + double norm = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar alpha is zero (operation is C:=beta*C) + */ +CTEST(zgeadd, matrix_n_100_m_100_alpha_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {0.0, 0.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar beta is zero (operation is C:=alpha*A) + */ +CTEST(zgeadd, matrix_n_100_m_100_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {3.0, 1.5}; + double beta[] = {0.0, 0.0}; + + double norm = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalars alpha, beta is zero (operation is C:= 0) + */ +CTEST(zgeadd, matrix_n_100_m_100_alpha_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {0.0, 0.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(zgeadd, matrix_n_100_m_50) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M / 2; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test error function for an invalid param n - + * number of columns of A and C + * Must be at least zero. + */ +CTEST(zgeadd, xerbla_n_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = m; + blasint ldc = m; + + int expected_info = 2; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + */ +CTEST(zgeadd, xerbla_m_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + */ +CTEST(zgeadd, xerbla_lda_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 5; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + */ +CTEST(zgeadd, xerbla_ldc_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Check if n - number of columns of A, C equal zero. + */ +CTEST(zgeadd, n_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 0; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Check if m - number of rows of A and C equal zero. + */ +CTEST(zgeadd, m_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 0; + + blasint lda = 1; + blasint ldc = 1; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(zgeadd, c_api_matrix_n_100_m_100) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {2.0, 1.0}; + double beta[] = {1.0, 3.0}; + + double norm = check_zgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * c api option order is row-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(zgeadd, c_api_matrix_n_100_m_100_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {4.0, 1.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * c api option order is row-major order + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(zgeadd, c_api_matrix_n_50_m_100_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = N / 2; + blasint m = M; + + blasint lda = n; + blasint ldc = n; + + double alpha[] = {3.0, 2.5}; + double beta[] = {1.0, 2.0}; + + double norm = check_zgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar alpha is zero (operation is C:=beta*C) + */ +CTEST(zgeadd, c_api_matrix_n_100_m_100_alpha_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {0.0, 0.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar beta is zero (operation is C:=alpha*A) + */ +CTEST(zgeadd, c_api_matrix_n_100_m_100_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {3.0, 1.5}; + double beta[] = {0.0, 0.0}; + + double norm = check_zgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalars alpha, beta is zero (operation is C:= 0) + */ +CTEST(zgeadd, c_api_matrix_n_100_m_100_alpha_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {0.0, 0.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(zgeadd, c_api_matrix_n_100_m_50) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M / 2; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {2.0, 3.0}; + double beta[] = {2.0, 4.0}; + + double norm = check_zgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test error function for an invalid param order - + * specifies whether A and C stored in + * row-major order or column-major order + */ +CTEST(zgeadd, c_api_xerbla_invalid_order) +{ + CBLAS_ORDER order = INVALID; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 0; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n - + * number of columns of A and C. + * Must be at least zero. + * + * c api option order is column-major order + */ +CTEST(zgeadd, c_api_xerbla_n_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 2; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n - + * number of columns of A and C. + * Must be at least zero. + * + * c api option order is row-major order + */ +CTEST(zgeadd, c_api_xerbla_n_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 2; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + * + * c api option order is column-major order + */ +CTEST(zgeadd, c_api_xerbla_m_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + * + * c api option order is row-major order + */ +CTEST(zgeadd, c_api_xerbla_m_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + * + * c api option order is column-major order + */ +CTEST(zgeadd, c_api_xerbla_lda_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 5; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + * + * c api option order is row-major order + */ +CTEST(zgeadd, c_api_xerbla_lda_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 5; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + * + * c api option order is column-major order + */ +CTEST(zgeadd, c_api_xerbla_ldc_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + * + * c api option order is row-major order + */ +CTEST(zgeadd, c_api_xerbla_ldc_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Check if n - number of columns of A, C equal zero. + * + * c api option order is column-major order + */ +CTEST(zgeadd, c_api_n_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 0; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Check if m - number of rows of A and C equal zero. + * + * c api option order is column-major order + */ +CTEST(zgeadd, c_api_m_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 0; + + blasint lda = 1; + blasint ldc = 1; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_zgemm.c b/utest/test_extensions/test_zgemm.c new file mode 100644 index 000000000..4160a5086 --- /dev/null +++ b/utest/test_extensions/test_zgemm.c @@ -0,0 +1,273 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_ZGEMM { + double a_test[DATASIZE * DATASIZE * 2]; + double a_verify[DATASIZE * DATASIZE * 2]; + double b_test[DATASIZE * DATASIZE * 2]; + double b_verify[DATASIZE * DATASIZE * 2]; + double c_test[DATASIZE * DATASIZE * 2]; + double c_verify[DATASIZE * DATASIZE * 2]; +}; + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZGEMM data_zgemm; + +/** + * Test zgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate zgemm. + * + * param transa specifies op(A), the transposition (conjugation) operation applied to A + * param transb specifies op(B), the transposition (conjugation) operation applied to B + * param m specifies the number of rows of the matrix op(A) and of the matrix C + * param n specifies the number of columns of the matrix op(B) and the number of columns of the matrix C + * param k specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B) + * param alpha - scaling factor for the matrix-matrix product + * param lda - leading dimension of matrix A + * param ldb - leading dimension of matrix B + * param beta - scaling factor for matrix C + * param ldc - leading dimension of matrix C + * return norm of difference + */ +static double check_zgemm(char transa, char transb, blasint m, blasint n, blasint k, + double *alpha, blasint lda, blasint ldb, double *beta, blasint ldc) +{ + blasint i; + double alpha_conj[] = {1.0, 0.0}; + char transa_verify = transa; + char transb_verify = transb; + + int arows = k, acols = m; + int brows = n, bcols = k; + + if (transa == 'T' || transa == 'C'){ + arows = m; acols = k; + } + + if (transb == 'T' || transb == 'C'){ + brows = k; bcols = n; + } + + drand_generate(data_zgemm.a_test, arows * lda * 2); + drand_generate(data_zgemm.b_test, brows * ldb * 2); + drand_generate(data_zgemm.c_test, n * ldc * 2); + + for (i = 0; i < arows * lda * 2; i++) + data_zgemm.a_verify[i] = data_zgemm.a_test[i]; + + for (i = 0; i < brows * ldb * 2; i++) + data_zgemm.b_verify[i] = data_zgemm.b_test[i]; + + for (i = 0; i < n * ldc * 2; i++) + data_zgemm.c_verify[i] = data_zgemm.c_test[i]; + + if (transa == 'R'){ + cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, arows, acols, alpha_conj, data_zgemm.a_verify, lda, lda); + transa_verify = 'N'; + } + + if (transb == 'R'){ + cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, brows, bcols, alpha_conj, data_zgemm.b_verify, ldb, ldb); + transb_verify = 'N'; + } + + BLASFUNC(zgemm)(&transa_verify, &transb_verify, &m, &n, &k, alpha, data_zgemm.a_verify, &lda, + data_zgemm.b_verify, &ldb, beta, data_zgemm.c_verify, &ldc); + + BLASFUNC(zgemm)(&transa, &transb, &m, &n, &k, alpha, data_zgemm.a_test, &lda, + data_zgemm.b_test, &ldb, beta, data_zgemm.c_test, &ldc); + + return dmatrix_difference(data_zgemm.c_test, data_zgemm.c_verify, m, n, ldc*2); +} + +/** + * Test zgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate zgemm. + * Test with the following options: + * + * matrix A is conjugate and transposed + * matrix B is conjugate and not transposed + */ +CTEST(zgemm, conjtransa_conjnotransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'C'; + char transb = 'R'; + double alpha[] = {-2.0, 1.0}; + double beta[] = {1.0, -1.0}; + + double norm = check_zgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test zgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate zgemm. + * Test with the following options: + * + * matrix A is not conjugate and not transposed + * matrix B is conjugate and not transposed + */ +CTEST(zgemm, notransa_conjnotransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'N'; + char transb = 'R'; + double alpha[] = {-2.0, 1.0}; + double beta[] = {1.0, -1.0}; + + double norm = check_zgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test zgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate zgemm. + * Test with the following options: + * + * matrix A is conjugate and not transposed + * matrix B is conjugate and transposed + */ +CTEST(zgemm, conjnotransa_conjtransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'R'; + char transb = 'C'; + double alpha[] = {-2.0, 1.0}; + double beta[] = {1.0, -1.0}; + + double norm = check_zgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test zgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate zgemm. + * Test with the following options: + * + * matrix A is conjugate and not transposed + * matrix B is not conjugate and not transposed + */ +CTEST(zgemm, conjnotransa_notransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'R'; + char transb = 'N'; + double alpha[] = {-2.0, 1.0}; + double beta[] = {1.0, -1.0}; + + double norm = check_zgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test zgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate zgemm. + * Test with the following options: + * + * matrix A is conjugate and not transposed + * matrix B is conjugate and not transposed + */ +CTEST(zgemm, conjnotransa_conjnotransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'R'; + char transb = 'R'; + double alpha[] = {-2.0, 1.0}; + double beta[] = {1.0, -1.0}; + + double norm = check_zgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test zgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate zgemm. + * Test with the following options: + * + * matrix A is conjugate and not transposed + * matrix B is transposed + */ +CTEST(zgemm, conjnotransa_transb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'R'; + char transb = 'T'; + double alpha[] = {-2.0, 1.0}; + double beta[] = {1.0, -1.0}; + + double norm = check_zgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test zgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate zgemm. + * Test with the following options: + * + * matrix A is transposed + * matrix B is conjugate and not transposed + */ +CTEST(zgemm, transa_conjnotransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'T'; + char transb = 'R'; + double alpha[] = {-2.0, 1.0}; + double beta[] = {1.0, -1.0}; + + double norm = check_zgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_zgemmt.c b/utest/test_extensions/test_zgemmt.c new file mode 100644 index 000000000..c55381008 --- /dev/null +++ b/utest/test_extensions/test_zgemmt.c @@ -0,0 +1,2010 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_ZGEMMT { + double a_test[DATASIZE * DATASIZE * 2]; + double b_test[DATASIZE * DATASIZE * 2]; + double c_test[DATASIZE * DATASIZE * 2]; + double c_verify[DATASIZE * DATASIZE * 2]; + double c_gemm[DATASIZE * DATASIZE * 2]; +}; + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZGEMMT data_zgemmt; + +/** + * Compute gemmt via gemm since gemmt is gemm but updates only + * the upper or lower triangular part of the result matrix + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order (for Fortran API column major always) + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param alpha - scaling factor for the matrix-matrix product + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + */ +static void zgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, double *alpha, blasint lda, + blasint ldb, double *beta, blasint ldc) +{ + blasint i, j; + + if(api == 'F') + BLASFUNC(zgemm)(&transa, &transb, &m, &m, &k, alpha, data_zgemmt.a_test, &lda, + data_zgemmt.b_test, &ldb, beta, data_zgemmt.c_gemm, &ldc); + else + cblas_zgemm(order, transa, transb, m, m, k, alpha, data_zgemmt.a_test, lda, + data_zgemmt.b_test, ldb, beta, data_zgemmt.c_gemm, ldc); + + ldc *= 2; + + if (uplo == 'L' || uplo == CblasLower) + { + for (i = 0; i < m; i++) + for (j = i * 2; j < m * 2; j+=2){ + data_zgemmt.c_verify[i * ldc + j] = + data_zgemmt.c_gemm[i * ldc + j]; + data_zgemmt.c_verify[i * ldc + j + 1] = + data_zgemmt.c_gemm[i * ldc + j + 1]; + } + } else { + for (i = 0; i < m; i++) + for (j = 0; j <= i * 2; j+=2){ + data_zgemmt.c_verify[i * ldc + j] = + data_zgemmt.c_gemm[i * ldc + j]; + data_zgemmt.c_verify[i * ldc + j + 1] = + data_zgemmt.c_gemm[i * ldc + j + 1]; + } + } +} + +/** + * Comapare results computed by zgemmt and zgemmt_trusted + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order (for Fortran API column major always) + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param alpha - scaling factor for the matrix-matrix product + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + * return norm of differences + */ +static double check_zgemmt(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, double *alpha, blasint lda, + blasint ldb, double *beta, blasint ldc) +{ + blasint i; + blasint b_cols; + blasint a_cols; + blasint inc = 1; + blasint size_c = m * ldc * 2; + + if(order == CblasColMajor){ + if (transa == 'T' || transa == 'C' || + transa == CblasTrans || transa == CblasConjTrans) + a_cols = m; + else a_cols = k; + + if (transb == 'T' || transb == 'C' || + transb == CblasTrans || transb == CblasConjTrans) + b_cols = k; + else b_cols = m; + } else { + if (transa == 'T' || transa == 'C' || + transa == CblasTrans || transa == CblasConjTrans) + a_cols = k; + else a_cols = m; + + if (transb == 'T' || transb == 'C' || + transb == CblasTrans || transb == CblasConjTrans) + b_cols = m; + else b_cols = k; + } + + drand_generate(data_zgemmt.a_test, a_cols * lda * 2); + drand_generate(data_zgemmt.b_test, b_cols * ldb * 2); + drand_generate(data_zgemmt.c_test, m * ldc * 2); + + for (i = 0; i < m * ldc * 2; i++) + data_zgemmt.c_gemm[i] = data_zgemmt.c_verify[i] = data_zgemmt.c_test[i]; + + zgemmt_trusted(api, order, uplo, transa, transb, m, k, alpha, lda, ldb, beta, ldc); + + if (api == 'F') + BLASFUNC(zgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_zgemmt.a_test, + &lda, data_zgemmt.b_test, &ldb, beta, data_zgemmt.c_test, &ldc); + else + cblas_zgemmt(order, uplo, transa, transb, m, k, alpha, data_zgemmt.a_test, lda, + data_zgemmt.b_test, ldb, beta, data_zgemmt.c_test, ldc); + + for (i = 0; i < m * ldc * 2; i++) + data_zgemmt.c_verify[i] -= data_zgemmt.c_test[i]; + + return BLASFUNC(dnrm2)(&size_c, data_zgemmt.c_verify, &inc) / size_c; +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param ldc - leading dimension of C + * param expected_info - expected invalid parameter number in zgemmt + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, blasint lda, blasint ldb, + blasint ldc, int expected_info) +{ + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + set_xerbla("ZGEMMT ", expected_info); + + if (api == 'F') + BLASFUNC(zgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_zgemmt.a_test, + &lda, data_zgemmt.b_test, &ldb, beta, data_zgemmt.c_test, &ldc); + else + cblas_zgemmt(order, uplo, transa, transb, m, k, alpha, data_zgemmt.a_test, lda, + data_zgemmt.b_test, ldb, beta, data_zgemmt.c_test, ldc); + + return check_error(); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(zgemmt, upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(zgemmt, upper_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 25, ldb = 25, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + double alpha[] = {1.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(zgemmt, upper_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 25, ldc = 25; + char transa = 'N', transb = 'T'; + char uplo = 'U'; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(zgemmt, upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'T'; + char uplo = 'U'; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(zgemmt, upper_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 50, ldc = 25; + char transa = 'R', transb = 'R'; + char uplo = 'U'; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(zgemmt, upper_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'C', transb = 'R'; + char uplo = 'U'; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(zgemmt, upper_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'R', transb = 'C'; + char uplo = 'U'; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(zgemmt, upper_M_50_K_25_a_conjtrans_b_conjtrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'C', transb = 'C'; + char uplo = 'U'; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(zgemmt, upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + double alpha[] = {0.0, 0.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(zgemmt, upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 0.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(zgemmt, lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(zgemmt, lower_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 25, ldb = 25, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'L'; + double alpha[] = {1.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(zgemmt, lower_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 25, ldc = 25; + char transa = 'N', transb = 'T'; + char uplo = 'L'; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(zgemmt, lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'T'; + char uplo = 'L'; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(zgemmt, lower_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 50, ldc = 25; + char transa = 'R', transb = 'R'; + char uplo = 'L'; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(zgemmt, lower_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'C', transb = 'R'; + char uplo = 'L'; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(zgemmt, lower_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'R', transb = 'C'; + char uplo = 'L'; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(zgemmt, lower_M_50_K_25_a_conjtrans_b_conjtrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'C', transb = 'C'; + char uplo = 'L'; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(zgemmt, lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + double alpha[] = {0.0, 0.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(zgemmt, lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 0.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(zgemmt, c_api_colmajor_upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(zgemmt, c_api_colmajor_upper_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 25, ldb = 25, ldc = 50; + double alpha[] = {1.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(zgemmt, c_api_colmajor_upper_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 25, ldc = 25; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(zgemmt, c_api_colmajor_upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(zgemmt, c_api_colmajor_upper_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 50, ldc = 25; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasConjNoTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(zgemmt, c_api_colmajor_upper_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasConjTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(zgemmt, c_api_colmajor_upper_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasConjNoTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(zgemmt, c_api_colmajor_upper_M_50_K_25_a_conjtrans_b_conjtrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasConjTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(zgemmt, c_api_colmajor_upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {0.0, 0.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(zgemmt, c_api_colmajor_upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 0.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(zgemmt, c_api_colmajor_lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(zgemmt, c_api_colmajor_lower_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 25, ldb = 25, ldc = 50; + double alpha[] = {1.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(zgemmt, c_api_colmajor_lower_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 25, ldc = 25; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(zgemmt, c_api_colmajor_lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(zgemmt, c_api_colmajor_lower_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 50, ldc = 25; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasConjNoTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(zgemmt, c_api_colmajor_lower_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasConjTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(zgemmt, c_api_colmajor_lower_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasConjNoTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(zgemmt, c_api_colmajor_lower_M_50_K_25_a_conjtrans_b_conjtrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasConjTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(zgemmt, c_api_colmajor_lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {0.0, 0.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(zgemmt, c_api_colmajor_lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 0.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(zgemmt, c_api_rowmajor_upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(zgemmt, c_api_rowmajor_upper_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(zgemmt, c_api_rowmajor_upper_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 50, ldb = 50, ldc = 25; + double alpha[] = {1.0, 1.0}; + double beta[] = {-1.0, -1.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(zgemmt, c_api_rowmajor_upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(zgemmt, c_api_rowmajor_upper_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 50, ldb = 25, ldc = 25; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasConjNoTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(zgemmt, c_api_rowmajor_upper_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasConjTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(zgemmt, c_api_rowmajor_upper_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasConjNoTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(zgemmt, c_api_rowmajor_upper_M_25_K_50_a_conjtrans_b_conjtrans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 50, ldc = 25; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasConjTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(zgemmt, c_api_rowmajor_upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {0.0, 0.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(zgemmt, c_api_rowmajor_upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 0.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(zgemmt, c_api_rowmajor_lower_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 50, ldb = 50, ldc = 25; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(zgemmt, c_api_rowmajor_lower_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 50, ldb = 25, ldc = 25; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasConjNoTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasConjTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasConjNoTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_25_a_conjtrans_b_conjtrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 25, ldc = 50; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasConjTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(zgemmt, c_api_rowmajor_lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {0.0, 0.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(zgemmt, c_api_rowmajor_lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 0.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test error function for an invalid param uplo. + * Must be upper (U) or lower (L). + */ +CTEST(zgemmt, xerbla_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'O'; + int expected_info = 1; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param transa. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(zgemmt, xerbla_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'O', transb = 'N'; + char uplo = 'U'; + int expected_info = 2; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param transb. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(zgemmt, xerbla_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'O'; + char uplo = 'U'; + int expected_info = 3; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(zgemmt, xerbla_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 4; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(zgemmt, xerbla_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 5; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(zgemmt, xerbla_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 100, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + int expected_info = 8; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B not transposed. + */ +CTEST(zgemmt, xerbla_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 10; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(zgemmt, xerbla_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + int expected_info = 13; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. + * Test error function for an invalid param order. + * Must be column or row major. + */ +CTEST(zgemmt, xerbla_c_api_major_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 0; + + int passed = check_badargs('C', 'O', CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param uplo. + * Must be upper or lower. + */ +CTEST(zgemmt, xerbla_c_api_colmajor_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 1; + + int passed = check_badargs('C', CblasColMajor, 'O', CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param transa. + * Must be trans or no-trans. + */ +CTEST(zgemmt, xerbla_c_api_colmajor_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 2; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, 'O', CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param transb. + * Must be trans or no-trans. + */ +CTEST(zgemmt, xerbla_c_api_colmajor_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 3; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, 'O', + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(zgemmt, xerbla_c_api_colmajor_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 4; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(zgemmt, xerbla_c_api_colmajor_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 5; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(zgemmt, xerbla_c_api_colmajor_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 100, ldc = 50; + int expected_info = 8; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B not transposed. + */ +CTEST(zgemmt, xerbla_c_api_colmajor_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 10; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(zgemmt, xerbla_c_api_colmajor_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 13; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param uplo. + * Must be upper or lower. + */ +CTEST(zgemmt, xerbla_c_api_rowmajor_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 1; + + int passed = check_badargs('C', CblasRowMajor, 'O', CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param transa. + * Must be trans or no-trans. + */ +CTEST(zgemmt, xerbla_c_api_rowmajor_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 2; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, 'O', CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param transb. + * Must be trans or no-trans. + */ +CTEST(zgemmt, xerbla_c_api_rowmajor_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 3; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, 'O', + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(zgemmt, xerbla_c_api_rowmajor_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 4; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(zgemmt, xerbla_c_api_rowmajor_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 5; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(zgemmt, xerbla_c_api_rowmajor_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 8; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B transposed. + */ +CTEST(zgemmt, xerbla_c_api_rowmajor_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 10; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(zgemmt, xerbla_c_api_rowmajor_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 100, ldb = 100, ldc = 50; + int expected_info = 13; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_zgemv_n.c b/utest/test_extensions/test_zgemv_n.c new file mode 100644 index 000000000..903b855e1 --- /dev/null +++ b/utest/test_extensions/test_zgemv_n.c @@ -0,0 +1,341 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_ZSPMV_N { + double a_test[DATASIZE * DATASIZE * 2]; + double b_test[DATASIZE * 2 * INCREMENT]; + double c_test[DATASIZE * 2 * INCREMENT]; + double c_verify[DATASIZE * 2 * INCREMENT]; +}; + +#ifdef BUILD_COMPLEX16 + +static struct DATA_ZSPMV_N data_zgemv_n; + +/** + * zgemv not transposed reference code + * + * param trans specifies whether matris A is conj or/and xconj + * param m - number of rows of A + * param n - number of columns of A + * param alpha - scaling factor for the matrib-vector product + * param a - buffer holding input matrib A + * param lda - leading dimension of matrix A + * param b - Buffer holding input vector b + * param inc_b - stride of vector b + * param beta - scaling factor for vector c + * param c - buffer holding input/output vector c + * param inc_c - stride of vector c + */ +static void zgemv_n_trusted(char trans, blasint m, blasint n, double *alpha, double *a, + blasint lda, double *b, blasint inc_b, double *beta, double *c, + blasint inc_c) +{ + blasint i, j; + blasint i2 = 0; + blasint ib = 0, ic = 0; + + double temp_r, temp_i; + + double *a_ptr = a; + blasint lda2 = 2*lda; + + blasint inc_b2 = 2 * inc_b; + blasint inc_c2 = 2 * inc_c; + + BLASFUNC(zscal)(&m, beta, c, &inc_c); + + for (j = 0; j < n; j++) + { + + if (trans == 'N' || trans == 'R') { + temp_r = alpha[0] * b[ib] - alpha[1] * b[ib+1]; + temp_i = alpha[0] * b[ib+1] + alpha[1] * b[ib]; + } else { + temp_r = alpha[0] * b[ib] + alpha[1] * b[ib+1]; + temp_i = alpha[0] * b[ib+1] - alpha[1] * b[ib]; + } + + ic = 0; + i2 = 0; + + for (i = 0; i < m; i++) + { + if (trans == 'N') { + c[ic] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1]; + c[ic+1] += temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2]; + } + if (trans == 'O') { + c[ic] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1]; + c[ic+1] += temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2]; + } + if (trans == 'R') { + c[ic] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1]; + c[ic+1] -= temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2]; + } + if (trans == 'S') { + c[ic] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1]; + c[ic+1] -= temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2]; + } + i2 += 2; + ic += inc_c2; + } + a_ptr += lda2; + ib += inc_b2; + } + +} + +/** + * Comapare results computed by zgemv and zgemv_n_trusted + * + * param trans specifies whether matris A is conj or/and xconj + * param m - number of rows of A + * param n - number of columns of A + * param alpha - scaling factor for the matrib-vector product + * param lda - leading dimension of matrix A + * param inc_b - stride of vector b + * param beta - scaling factor for vector c + * param inc_c - stride of vector c + * return norm of differences + */ +static double check_zgemv_n(char trans, blasint m, blasint n, double *alpha, blasint lda, + blasint inc_b, double *beta, blasint inc_c) +{ + blasint i; + + drand_generate(data_zgemv_n.a_test, n * lda); + drand_generate(data_zgemv_n.b_test, 2 * n * inc_b); + drand_generate(data_zgemv_n.c_test, 2 * m * inc_c); + + for (i = 0; i < m * 2 * inc_c; i++) + data_zgemv_n.c_verify[i] = data_zgemv_n.c_test[i]; + + zgemv_n_trusted(trans, m, n, alpha, data_zgemv_n.a_test, lda, data_zgemv_n.b_test, + inc_b, beta, data_zgemv_n.c_test, inc_c); + BLASFUNC(zgemv)(&trans, &m, &n, alpha, data_zgemv_n.a_test, &lda, data_zgemv_n.b_test, + &inc_b, beta, data_zgemv_n.c_verify, &inc_c); + + for (i = 0; i < m * 2 * inc_c; i++) + data_zgemv_n.c_verify[i] -= data_zgemv_n.c_test[i]; + + return BLASFUNC(dznrm2)(&n, data_zgemv_n.c_verify, &inc_c); +} + +/** + * Test zgemv by comparing it against reference + * with the following options: + * + * A is xconj + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(zgemv, trans_o_square_matrix) +{ + blasint n = 100, m = 100, lda = 100; + blasint inc_b = 1, inc_c = 1; + char trans = 'O'; + double alpha[] = {2.0, -1.0}; + double beta[] = {1.4, 5.0}; + + double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgemv by comparing it against reference + * with the following options: + * + * A is xconj + * Number of rows of A is 50 + * Number of colums of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(zgemv, trans_o_rectangular_matrix_rows_less_then_cols) +{ + blasint n = 100, m = 50, lda = 50; + blasint inc_b = 1, inc_c = 1; + char trans = 'O'; + double alpha[] = {2.0, -1.0}; + double beta[] = {1.4, 5.0}; + + double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgemv by comparing it against reference + * with the following options: + * + * A is xconj + * Number of rows of A is 100 + * Number of colums of A is 50 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(zgemv, trans_o_rectangular_matrix_cols_less_then_rows) +{ + blasint n = 50, m = 100, lda = 100; + blasint inc_b = 1, inc_c = 1; + char trans = 'O'; + double alpha[] = {2.0, -1.0}; + double beta[] = {1.4, 5.0}; + + double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgemv by comparing it against reference + * with the following options: + * + * A is xconj + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 2 + */ +CTEST(zgemv, trans_o_double_strides) +{ + blasint n = 100, m = 100, lda = 100; + blasint inc_b = 2, inc_c = 2; + char trans = 'O'; + double alpha[] = {2.0, -1.0}; + double beta[] = {1.4, 5.0}; + + double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgemv by comparing it against reference + * with the following options: + * + * A is xconj and conj + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(zgemv, trans_s_square_matrix) +{ + blasint n = 100, m = 100, lda = 100; + blasint inc_b = 1, inc_c = 1; + char trans = 'S'; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.4, 5.0}; + + double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgemv by comparing it against reference + * with the following options: + * + * A is xconj and conj + * Number of rows of A is 50 + * Number of colums of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(zgemv, trans_s_rectangular_matrix_rows_less_then_cols) +{ + blasint n = 100, m = 50, lda = 50; + blasint inc_b = 1, inc_c = 1; + char trans = 'S'; + double alpha[] = {2.0, -1.0}; + double beta[] = {1.4, 5.0}; + + double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgemv by comparing it against reference + * with the following options: + * + * A is xconj and conj + * Number of rows of A is 100 + * Number of colums of A is 50 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(zgemv, trans_s_rectangular_matrix_cols_less_then_rows) +{ + blasint n = 50, m = 100, lda = 100; + blasint inc_b = 1, inc_c = 1; + char trans = 'S'; + double alpha[] = {2.0, -1.0}; + double beta[] = {1.4, 0.0}; + + double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgemv by comparing it against reference + * with the following options: + * + * A is xconj and conj + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 2 + */ +CTEST(zgemv, trans_s_double_strides) +{ + blasint n = 100, m = 100, lda = 100; + blasint inc_b = 2, inc_c = 2; + char trans = 'S'; + double alpha[] = {2.0, -1.0}; + double beta[] = {1.0, 5.0}; + + double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +#endif diff --git a/utest/test_extensions/test_zgemv_t.c b/utest/test_extensions/test_zgemv_t.c new file mode 100644 index 000000000..b2d0b2713 --- /dev/null +++ b/utest/test_extensions/test_zgemv_t.c @@ -0,0 +1,1136 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define N 100 +#define M 100 +#define INCREMENT 2 + +struct DATA_ZGEMV_T { + double a_test[N * M * 2]; + double a_verify[N * M * 2]; + double y_test[M * INCREMENT * 2]; + double y_verify[M * INCREMENT * 2]; + double x_test[M * INCREMENT * 2]; + double x_verify[M * INCREMENT * 2]; +}; + +// DOUBLE_EPS_ZGEMV = MAX_VAL * NUMBER OF OPERATIONS * DBL_EPSILON +// DOUBLE_EPS_ZGEMV = 5.0 * O(100 * 100) * 2.2e-16 = 1e-11 +#define DOUBLE_EPS_ZGEMV 1e-11 + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZGEMV_T data_zgemv_t; + +/** + * Find product of matrix-vector multiplication + * + * param n specifies number of columns of A + * param m specifies number of rows of A and size of vector x + * param lda specifies leading dimension of A + * param inc_x specifies increment of vector x + */ +static void matrix_vector_product(blasint n, blasint m, blasint lda, blasint inc_x) +{ + blasint i; + double *a_ptr = data_zgemv_t.a_verify; + double *x_ptr = data_zgemv_t.x_test; + double *x_res = data_zgemv_t.x_verify; + + openblas_complex_double result; + + for (i = 0; i < n * inc_x; i += inc_x) + { + result = cblas_zdotu(lda, a_ptr, 1, x_ptr, inc_x); + x_res[0] = CREAL(result); + x_res[1] = CIMAG(result); + a_ptr += lda * 2; + x_res += 2 * inc_x; + } +} + +/** + * Test zgemv by comparing it against zomatcopy, zaxpby and + * reference func matrix_vector_product + * + * zomatcopy perform operation: op(A) + * matrix_vector_product perform operation: A*x + * zaxpby perform operation: alpha*x + beta*y + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param m specifies number of rows of A + * param n specifies number of columns of A + * param alpha specifies scalar alpha + * param lda specifies leading dimension of the matrix A + * param inc_x specifies increment for vector x + * param beta specifies scalar beta + * param inc_y specifies increment for vector y + * return norm of difference between zgemv and result of reference funcs + */ +static double check_zgemv(char api, char order, char trans, blasint m, blasint n, double *alpha, + blasint lda, blasint inc_x, double *beta, blasint inc_y) +{ + blasint i; + + enum CBLAS_ORDER corder; + enum CBLAS_TRANSPOSE ctrans; + + // Transpose parameters for zomatcopy + // zgemv_t perform operation on transposed matrix, no need to transpose a_verify + char trans_copy; + char ctrans_copy; + + // Param alpha for zomatcopy, scale on alpha perform zaxpby + double alpha_one[] = {1.0, 0.0}; + + memset(data_zgemv_t.x_verify, 0.0, m * inc_x * 2 * sizeof(double)); + + // Fill matrix A, vectors x, y + drand_generate(data_zgemv_t.a_test, lda * n * 2); + drand_generate(data_zgemv_t.x_test, m * inc_x * 2); + drand_generate(data_zgemv_t.y_test, m * inc_y * 2); + + // Copy vector y for reference funcs + for (i = 0; i < m * inc_y * 2; i++) + { + data_zgemv_t.y_verify[i] = data_zgemv_t.y_test[i]; + } + + if (api == 'F') { + if (trans == 'T') trans_copy = 'N'; + if (trans == 'C') trans_copy = 'R'; + if (trans == 'U') trans_copy = 'R'; + if (trans == 'D') trans_copy = 'N'; + + // Perform operation: op(A) + BLASFUNC(zomatcopy)(&order, &trans_copy, &m, &n, alpha_one, + data_zgemv_t.a_test, &lda, data_zgemv_t.a_verify, &lda); + + // Find A*x + matrix_vector_product(n, m, lda, inc_x); + + // Find conj(x) + if (trans == 'U' || trans == 'D') + { + zconjugate_vector(m, inc_x, data_zgemv_t.x_verify); + } + + // Find alpha*x+beta*y + BLASFUNC(zaxpby)(&n, alpha, data_zgemv_t.x_verify, &inc_x, beta, + data_zgemv_t.y_verify, &inc_y); + + BLASFUNC(zgemv)(&trans, &m, &n, alpha, data_zgemv_t.a_test, &lda, + data_zgemv_t.x_test, &inc_x, beta, data_zgemv_t.y_test, &inc_y); + } + else { + if (order == 'C') corder = CblasColMajor; + if (order == 'R') corder = CblasRowMajor; + if (trans == 'T') {ctrans = CblasTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasTrans : CblasNoTrans;} + if (trans == 'N') {ctrans = CblasNoTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasNoTrans : CblasTrans;} + if (trans == 'C') {ctrans = CblasConjTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasConjTrans : CblasConjNoTrans;} + if (trans == 'R') {ctrans = CblasConjNoTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasConjNoTrans : CblasConjTrans;} + + // Perform operation: op(A) + cblas_zomatcopy(corder, ctrans_copy, m, n, alpha_one, data_zgemv_t.a_test, lda, data_zgemv_t.a_verify, lda); + + // Find A*x + matrix_vector_product(n, m, lda, inc_x); + + // Find alpha*x+beta*y + cblas_zaxpby(n, alpha, data_zgemv_t.x_verify, inc_x, beta, data_zgemv_t.y_verify, inc_y); + + cblas_zgemv(corder, ctrans, m, n, alpha, data_zgemv_t.a_test, + lda, data_zgemv_t.x_test, inc_x, beta, data_zgemv_t.y_test, inc_y); + } + + // Find the differences between output vector caculated by zgemv and reference funcs + for (i = 0; i < m * inc_y * 2; i++) + data_zgemv_t.y_test[i] -= data_zgemv_t.y_verify[i]; + + // Find the norm of differences + return cblas_dznrm2(m, data_zgemv_t.y_test, inc_y); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param m specifies number of rows of A + * param n specifies number of columns of A + * param lda specifies leading dimension of the matrix A + * param inc_x specifies increment for vector x + * param inc_y specifies increment for vector y + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char order, char trans, blasint m, blasint n, + blasint lda, blasint inc_x, blasint inc_y, int expected_info) +{ + double alpha[] = {1.0, 1.0}; + double a[] = {1.0, 1.0}; + double x[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + double y[] = {1.0, 1.0}; + + set_xerbla("ZGEMV ", expected_info); + + BLASFUNC(zgemv)(&trans, &m, &n, alpha, a, &lda, x, + &inc_x, beta, y, &inc_y); + + return check_error(); +} + +/** + * C API specific function + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param m specifies number of rows of A + * param n specifies number of columns of A + * param lda specifies leading dimension of the matrix A + * param inc_x specifies increment for vector x + * param inc_y specifies increment for vector y + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int c_api_check_badargs(CBLAS_ORDER corder, CBLAS_TRANSPOSE ctrans, blasint m, blasint n, + blasint lda, blasint inc_x, blasint inc_y, int expected_info) +{ + double alpha[] = {1.0, 1.0}; + double a[] = {1.0, 1.0}; + double x[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + double y[] = {1.0, 1.0}; + + set_xerbla("ZGEMV ", expected_info); + + cblas_zgemv(corder, ctrans, m, n, alpha, a, lda, x, inc_x, beta, y, inc_y); + + return check_error(); +} + +/** + * Fortran API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * inc x = 2, inc y = 1 + * alpha_r = 1.0, alpha_i = 1.0 + * beta_r = 2.0, beta_i = 2.0 + */ +CTEST(zgemv, colmajor_trans_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'T'; + + double alpha[] = {2.0, 1.0}; + double beta[] = {1.0, 2.0}; + + blasint inc_x = 1; + blasint inc_y = 1; + + double norm = check_zgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * inc x = 2, inc y = 1 + * alpha_r = 1.0, alpha_i = 1.0 + * beta_r = 2.0, beta_i = 2.0 + */ +CTEST(zgemv, colmajor_trans_col_100_row_100_inc_x_2_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'T'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {2.0, 2.0}; + + blasint inc_x = 2; + blasint inc_y = 1; + + double norm = check_zgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate A + * Square matrix + * inc x = 1, inc y = 1 + * alpha_r = 2.0, alpha_i = 1.0 + * beta_r = 2.0, beta_i = 1.0 + */ +CTEST(zgemv, colmajor_conjtrans_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'C'; + + double alpha[] = {2.0, 1.0}; + double beta[] = {2.0, 1.0}; + + blasint inc_x = 1; + blasint inc_y = 1; + + double norm = check_zgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate A + * Square matrix + * inc x = 1, inc y = 2 + * alpha_r = 2.0, alpha_i = 1.0 + * beta_r = 2.0, beta_i = 1.0 + */ +CTEST(zgemv, colmajor_conjtrans_col_100_row_100_inc_x_1_y_2) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'C'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + blasint inc_x = 1; + blasint inc_y = 2; + + double norm = check_zgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and x conjugate + * Square matrix + * inc x = 1, inc y = 1 + * alpha_r = 2.0, alpha_i = 1.0 + * beta_r = 2.0, beta_i = 1.0 + */ +CTEST(zgemv, colmajor_trans_x_conj_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'U'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + blasint inc_x = 1; + blasint inc_y = 1; + + double norm = check_zgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and x conjugate + * Square matrix + * inc x = 2, inc y = 2 + * alpha_r = 1.0, alpha_i = 2.0 + * beta_r = 1.0, beta_i = 1.0 + */ +CTEST(zgemv, colmajor_trans_x_conj_col_100_row_100_inc_x_2_y_2) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'U'; + + double alpha[] = {1.0, 2.0}; + double beta[] = {1.0, 1.0}; + + blasint inc_x = 2; + blasint inc_y = 2; + + double norm = check_zgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition, conjugate A, conjugate x + * Square matrix + * inc x = 2, inc y = 2 + * alpha_r = 2.0, alpha_i = 1.0 + * beta_r = 1.0, beta_i = 2.0 + */ +CTEST(zgemv, colmajor_conjtrans_x_conj_col_100_row_100_inc_x_1_y_2) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'D'; + + double alpha[] = {2.0, 1.0}; + double beta[] = {1.0, 2.0}; + + blasint inc_x = 1; + blasint inc_y = 2; + + double norm = check_zgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition, conjugate A, conjugate x + * Square matrix + * inc x = 2, inc y = 1 + * alpha_r = 2.0, alpha_i = 1.0 + * beta_r = 1.0, beta_i = 2.0 + */ +CTEST(zgemv, c_api_colmajor_trans_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'T'; + + double alpha[] = {2.0, 1.0}; + double beta[] = {1.0, 2.0}; + + blasint inc_x = 1; + blasint inc_y = 1; + + double norm = check_zgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate A + * Square matrix + * inc x = 2, inc y = 1 + * alpha_r = 1.0, alpha_i = 1.0 + * beta_r = 1.0, beta_i = 2.0 + */ +CTEST(zgemv, c_api_colmajor_conjtrans_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'C'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 2.0}; + + blasint inc_x = 1; + blasint inc_y = 1; + + double norm = check_zgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate A + * Square matrix + * inc x = 1, inc y = 2 + * alpha_r = 1.0, alpha_i = 1.0 + * beta_r = 1.0, beta_i = 2.0 + */ +CTEST(zgemv, c_api_colmajor_conjtrans_col_100_row_100_inc_x_1_y_2) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'C'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 2.0}; + + blasint inc_x = 1; + blasint inc_y = 2; + + double norm = check_zgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Row Major + * Square matrix + * inc x = 1, inc y = 1 + * alpha_r = 2.0, alpha_i = 1.0 + * beta_r = 1.0, beta_i = 1.0 + */ +CTEST(zgemv, c_api_rowmajor_notrans_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'R'; + char trans = 'N'; + + double alpha[] = {2.0, 1.0}; + double beta[] = {1.0, 1.0}; + + blasint inc_x = 1; + blasint inc_y = 1; + + double norm = check_zgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Row Major + * No trans + * Square matrix + * inc x = 2, inc y = 2 + * alpha_r = 1.0, alpha_i = 1.0 + * beta_r = 3.0, beta_i = 2.0 + */ +CTEST(zgemv, c_api_rowmajor_notrans_col_100_row_100_inc_x_2_y_2) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'R'; + char trans = 'N'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {3.0, 1.0}; + + blasint inc_x = 2; + blasint inc_y = 2; + + double norm = check_zgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Conjugate + * Square matrix + * inc x = 1, inc y = 1 + * alpha_r = 1.0, alpha_i = 3.0 + * beta_r = 1.0, beta_i = 2.5 + */ +CTEST(zgemv, c_api_rowmajor_conj_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'R'; + char trans = 'R'; + + double alpha[] = {1.0, 3.0}; + double beta[] = {1.0, 2.5}; + + blasint inc_x = 1; + blasint inc_y = 1; + + double norm = check_zgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Row Major + * Conjugate + * Square matrix + * inc x = 2, inc y = 1 + * alpha_r = 1.0, alpha_i = 1.0 + * beta_r = 1.0, beta_i = 1.5 + */ +CTEST(zgemv, c_api_rowmajor_conj_col_100_row_100_inc_x_2_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'R'; + char trans = 'R'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.5}; + + blasint inc_x = 2; + blasint inc_y = 1; + + double norm = check_zgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test error function for an invalid param inc_y. + * Must be positive + * + * Column major + */ +CTEST(zgemv, xerbla_invalid_inc_y) +{ + char order = 'C'; + char trans = 'T'; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 0; + + int expected_info = 11; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param inc_y. + * Must be positive + * + * Column major + */ +CTEST(zgemv, c_api_xerbla_invalid_inc_y_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 0; + + int expected_info = 11; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param inc_y. + * Must be positive + * + * Row major + */ +CTEST(zgemv, c_api_xerbla_invalid_inc_y_row_major) +{ + enum CBLAS_ORDER corder = CblasRowMajor; + enum CBLAS_TRANSPOSE ctrans = CblasNoTrans; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 0; + + int expected_info = 11; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param inc_x. + * Must be positive + * + * Column major + */ +CTEST(zgemv, xerbla_invalid_inc_x) +{ + char order = 'C'; + char trans = 'T'; + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 0; + blasint inc_y = 1; + + int expected_info = 8; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param inc_x. + * Must be positive + * + * Column major + */ +CTEST(zgemv, c_api_xerbla_invalid_inc_x_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 0; + blasint inc_y = 1; + + int expected_info = 8; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param inc_x. + * Must be positive + * + * Row major + */ +CTEST(zgemv, c_api_xerbla_invalid_inc_x_row_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 0; + blasint inc_y = 1; + + int expected_info = 8; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param n. + * Must be positive. + * + * Column major + */ +CTEST(zgemv, xerbla_invalid_n) +{ + char order = 'C'; + char trans = 'T'; + + blasint m = 1, n = INVALID; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 3; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n. + * Must be positive. + * + * Column major + */ +CTEST(zgemv, c_api_xerbla_invalid_n_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = INVALID; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 3; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n. + * Must be positive. + * + * Row major + */ +CTEST(zgemv, c_api_xerbla_invalid_n_row_major) +{ + enum CBLAS_ORDER corder = CblasRowMajor; + enum CBLAS_TRANSPOSE ctrans = CblasNoTrans; + + blasint m = INVALID, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 3; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param m. + * Must be positive. + * + * Column major + */ +CTEST(zgemv, xerbla_invalid_m) +{ + char order = 'C'; + char trans = 'T'; + + blasint m = INVALID, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 2; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m. + * Must be positive. + * + * Column major + */ +CTEST(zgemv, c_api_xerbla_invalid_m_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = INVALID, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 2; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m. + * Must be positive. + * + * Row major + */ +CTEST(zgemv, c_api_xerbla_invalid_m_row_major) +{ + enum CBLAS_ORDER corder = CblasRowMajor; + enum CBLAS_TRANSPOSE ctrans = CblasNoTrans; + + blasint m = 1, n = INVALID; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 2; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param lda. + * lda must be at least n. + * + * Column major + */ +CTEST(zgemv, xerbla_invalid_lda) +{ + char order = 'C'; + char trans = 'T'; + + blasint m = 1, n = 1; + blasint lda = INVALID; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 6; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda. + * If matrices are stored using col major layout, + * lda must be at least m. + * + * Column major + */ +CTEST(zgemv, c_api_xerbla_invalid_lda_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = 1; + blasint lda = INVALID; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 6; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda. + * If matrices are stored using col major layout, + * lda must be at least n. + * + * Column major + */ +CTEST(zgemv, c_api_xerbla_invalid_lda_row_major) +{ + enum CBLAS_ORDER corder = CblasRowMajor; + enum CBLAS_TRANSPOSE ctrans = CblasNoTrans; + + blasint m = 1, n = 1; + blasint lda = INVALID; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 6; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param trans. + * + * Column major + */ +CTEST(zgemv, xerbla_invalid_trans) +{ + char order = 'C'; + char trans = 'Z'; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 1; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param trans. + * + * Column major + */ +CTEST(zgemv, c_api_xerbla_invalid_trans_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = INVALID; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 1; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param trans. + * + * Row major + */ +CTEST(zgemv, c_api_xerbla_invalid_trans_row_major) +{ + enum CBLAS_ORDER corder = CblasRowMajor; + enum CBLAS_TRANSPOSE ctrans = INVALID; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 1; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param order. + */ +CTEST(zgemv, c_api_xerbla_invalid_order_col_major) +{ + enum CBLAS_ORDER corder = INVALID; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 0; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif diff --git a/utest/test_extensions/test_zimatcopy.c b/utest/test_extensions/test_zimatcopy.c new file mode 100644 index 000000000..8376bc493 --- /dev/null +++ b/utest/test_extensions/test_zimatcopy.c @@ -0,0 +1,818 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_ZIMATCOPY { + double a_test[DATASIZE * DATASIZE * 2]; + double a_verify[DATASIZE * DATASIZE * 2]; +}; + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZIMATCOPY data_zimatcopy; + +/** + * Comapare results computed by zimatcopy and reference func + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows specifies number of rows of A + * param cols specifies number of columns of A + * param alpha specifies scaling factor for matrix A + * param lda_src - leading dimension of the matrix A + * param lda_dst - leading dimension of output matrix A + * return norm of difference between openblas and reference func + */ +static double check_zimatcopy(char api, char order, char trans, blasint rows, blasint cols, double *alpha, + blasint lda_src, blasint lda_dst) +{ + blasint m, n; + blasint rows_out, cols_out; + enum CBLAS_ORDER corder; + enum CBLAS_TRANSPOSE ctrans; + int conj = -1; + + if (order == 'C') { + n = rows; m = cols; + } + else { + m = rows; n = cols; + } + + if(trans == 'T' || trans == 'C') { + rows_out = n; cols_out = m*2; + if (trans == 'C') + conj = 1; + } + else { + rows_out = m; cols_out = n*2; + if (trans == 'R') + conj = 1; + } + + drand_generate(data_zimatcopy.a_test, lda_src*m*2); + + if (trans == 'T' || trans == 'C') { + ztranspose(m, n, alpha, data_zimatcopy.a_test, lda_src, data_zimatcopy.a_verify, lda_dst, conj); + } + else { + zcopy(m, n, alpha, data_zimatcopy.a_test, lda_src, data_zimatcopy.a_verify, lda_dst, conj); + } + + if (api == 'F') { + BLASFUNC(zimatcopy)(&order, &trans, &rows, &cols, alpha, data_zimatcopy.a_test, + &lda_src, &lda_dst); + } + else { + if (order == 'C') corder = CblasColMajor; + if (order == 'R') corder = CblasRowMajor; + if (trans == 'T') ctrans = CblasTrans; + if (trans == 'N') ctrans = CblasNoTrans; + if (trans == 'C') ctrans = CblasConjTrans; + if (trans == 'R') ctrans = CblasConjNoTrans; + cblas_zimatcopy(corder, ctrans, rows, cols, alpha, data_zimatcopy.a_test, + lda_src, lda_dst); + } + + // Find the differences between output matrix computed by zimatcopy and reference func + return dmatrix_difference(data_zimatcopy.a_test, data_zimatcopy.a_verify, cols_out, rows_out, lda_dst*2); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows specifies number of rows of A + * param cols specifies number of columns of A + * param lda_src - leading dimension of the matrix A + * param lda_dst - leading dimension of output matrix A + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char order, char trans, blasint rows, blasint cols, + blasint lda_src, blasint lda_dst, int expected_info) +{ + double alpha[] = {1.0, 1.0}; + + set_xerbla("ZIMATCOPY", expected_info); + + BLASFUNC(zimatcopy)(&order, &trans, &rows, &cols, alpha, data_zimatcopy.a_test, + &lda_src, &lda_dst); + + return check_error(); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha_r = -3.0, alpha_i = 1.0 + */ +CTEST(zimatcopy, colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + double alpha[] = {-3.0, 1.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, colmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'R'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(zimatcopy, colmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'C'; + double alpha[] = {2.0, 1.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, colmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, colmajor_notrans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda_src = 50, lda_dst = 50; + char order = 'C'; + char trans = 'N'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate + * Rectangular matrix + * alpha_r = 1.0, alpha_i = 1.0 + */ +CTEST(zimatcopy, colmajor_conjtrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'C'; + double alpha[] = {1.0, 1.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy and conjugate + * Rectangular matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, colmajor_conj_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda_src = 50, lda_dst = 50; + char order = 'C'; + char trans = 'R'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha_r = 2.0, alpha_i = 3.0 + */ +CTEST(zimatcopy, rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + double alpha[] = {2.0, 3.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, rowmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'R'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(zimatcopy, rowmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'C'; + double alpha[] = {2.0, 1.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(zimatcopy, rowmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 50; + char order = 'R'; + char trans = 'N'; + double alpha[] = {2.0, 1.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Rectangular matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, rowmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy and conjugate + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(zimatcopy, rowmajor_conj_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 50; + char order = 'R'; + char trans = 'R'; + double alpha[] = {1.5, -1.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, rowmajor_conjtrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 100; + char order = 'R'; + char trans = 'C'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha_r = 3.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, c_api_colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + double alpha[] = {3.0, 2.0}; + + double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha_r = 3.0, alpha_i = 1.5 + */ +CTEST(zimatcopy, c_api_colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + double alpha[] = {3.0, 1.5}; + + double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha_r = 3.0, alpha_i = 1.0 + */ +CTEST(zimatcopy, c_api_rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + double alpha[] = {3.0, 1.0}; + + double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, c_api_colmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'R'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(zimatcopy, c_api_colmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'C'; + double alpha[] = {2.0, 1.0}; + + double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha_r = 1.0, alpha_i = 1.0 + */ +CTEST(zimatcopy, c_api_rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + double alpha[] = {1.0, 1.0}; + + double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy and conjugate + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(zimatcopy, c_api_rowmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'R'; + double alpha[] = {1.5, -1.0}; + + double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, c_api_rowmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'C'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test error function for an invalid param order. + * Must be column (C) or row major (R). + */ +CTEST(zimatcopy, xerbla_invalid_order) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'O'; + char trans = 'T'; + int expected_info = 1; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param trans. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(zimatcopy, xerbla_invalid_trans) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'O'; + int expected_info = 2; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_src. + * If matrices are stored using row major layout, + * lda_src must be at least n. + */ +CTEST(zimatcopy, xerbla_rowmajor_invalid_lda) +{ + blasint m = 50, n = 100; + blasint lda_src = 50, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_src. + * If matrices are stored using column major layout, + * lda_src must be at least m. + */ +CTEST(zimatcopy, xerbla_colmajor_invalid_lda) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using row major layout and + * there is no transposition, lda_dst must be at least n. + */ +CTEST(zimatcopy, xerbla_rowmajor_notrans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using row major layout and + * there is transposition, lda_dst must be at least m. + */ +CTEST(zimatcopy, xerbla_rowmajor_trans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using column major layout and + * there is no transposition, lda_dst must be at least m. + */ +CTEST(zimatcopy, xerbla_colmajor_notrans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using column major layout and + * there is transposition, lda_dst must be at least n. + */ +CTEST(zimatcopy, xerbla_colmajor_trans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_zomatcopy.c b/utest/test_extensions/test_zomatcopy.c new file mode 100644 index 000000000..495831c56 --- /dev/null +++ b/utest/test_extensions/test_zomatcopy.c @@ -0,0 +1,713 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_ZOMATCOPY { + double a_test[DATASIZE * DATASIZE * 2]; + double b_test[DATASIZE * DATASIZE * 2]; + double b_verify[DATASIZE * DATASIZE * 2]; +}; + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZOMATCOPY data_zomatcopy; + +/** + * Comapare results computed by zomatcopy and reference func + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows - number of rows of A + * param cols - number of columns of A + * param alpha - scaling factor for matrix B + * param lda - leading dimension of the matrix A + * param ldb - leading dimension of the matrix B + * return norm of difference between openblas and reference func + */ +static double check_zomatcopy(char api, char order, char trans, blasint rows, blasint cols, double* alpha, + blasint lda, blasint ldb) +{ + blasint b_rows, b_cols; + blasint m, n; + enum CBLAS_ORDER corder; + enum CBLAS_TRANSPOSE ctrans; + int conj = -1; + + if (order == 'C') { + m = cols; n = rows; + } + else { + m = rows; n = cols; + } + + if(trans == 'T' || trans == 'C') { + b_rows = n; b_cols = m*2; + if (trans == 'C') + conj = 1; + } + else { + b_rows = m; b_cols = n*2; + if (trans == 'R') + conj = 1; + } + + drand_generate(data_zomatcopy.a_test, lda*m*2); + + if (trans == 'T' || trans == 'C') { + ztranspose(m, n, alpha, data_zomatcopy.a_test, lda, data_zomatcopy.b_verify, ldb, conj); + } + else { + zcopy(m, n, alpha, data_zomatcopy.a_test, lda, data_zomatcopy.b_verify, ldb, conj); + } + + if (api == 'F') { + BLASFUNC(zomatcopy)(&order, &trans, &rows, &cols, alpha, data_zomatcopy.a_test, + &lda, data_zomatcopy.b_test, &ldb); + } + else { + if (order == 'C') corder = CblasColMajor; + if (order == 'R') corder = CblasRowMajor; + if (trans == 'T') ctrans = CblasTrans; + if (trans == 'N') ctrans = CblasNoTrans; + if (trans == 'C') ctrans = CblasConjTrans; + if (trans == 'R') ctrans = CblasConjNoTrans; + cblas_zomatcopy(corder, ctrans, rows, cols, alpha, data_zomatcopy.a_test, + lda, data_zomatcopy.b_test, ldb); + } + + return dmatrix_difference(data_zomatcopy.b_test, data_zomatcopy.b_verify, b_cols, b_rows, ldb*2); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows - number of rows of A + * param cols - number of columns of A + * param lda - leading dimension of the matrix A + * param ldb - leading dimension of the matrix B + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char order, char trans, blasint rows, blasint cols, + blasint lda, blasint ldb, int expected_info) +{ + double alpha[] = {1.0, 1.0}; + + set_xerbla("ZOMATCOPY", expected_info); + + BLASFUNC(zomatcopy)(&order, &trans, &rows, &cols, alpha, data_zomatcopy.a_test, + &lda, data_zomatcopy.b_test, &ldb); + + return check_error(); +} + +/** + * Fortran API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zomatcopy, colmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'N'; + double alpha[] = {1.0, 2.0}; + double norm; + + norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * alpha_r = -1.0, alpha_i = 2.0 + */ +CTEST(zomatcopy, colmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'T'; + double alpha[] = {-1.0, 2.0}; + double norm; + + norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zomatcopy, colmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'R'; + double alpha[] = {1.0, 2.0}; + double norm; + + norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition and conjugate + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(zomatcopy, colmajor_conjtrnas_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'C'; + double alpha[] = {2.0, 1.0}; + double norm; + + norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Fortran API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(zomatcopy, rowmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 50, ldb = 50; + char order = 'R'; + char trans = 'N'; + double alpha[] = {1.5, -1.0}; + double norm; + + norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(zomatcopy, rowmajor_trans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'T'; + double alpha[] = {1.5, -1.0}; + double norm; + + norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy and conjugate + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(zomatcopy, rowmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'R'; + double alpha[] = {1.5, -1.0}; + double norm; + + norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zomatcopy, rowmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'C'; + double alpha[] = {1.0, 2.0}; + double norm; + + norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zomatcopy, c_api_colmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'N'; + double alpha[] = {1.0, 2.0}; + double norm; + + norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * alpha_r = -1.0, alpha_i = 2.0 + */ +CTEST(zomatcopy, c_api_colmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'T'; + double alpha[] = {-1.0, 2.0}; + double norm; + + norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zomatcopy, c_api_colmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'R'; + double alpha[] = {1.0, 2.0}; + double norm; + + norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition and conjugate + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(zomatcopy, c_api_colmajor_conjtrnas_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'C'; + double alpha[] = {2.0, 1.0}; + double norm; + + norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(zomatcopy, c_api_rowmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 50, ldb = 50; + char order = 'R'; + char trans = 'N'; + double alpha[] = {1.5, -1.0}; + double norm; + + norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(zomatcopy, c_api_rowmajor_trans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'T'; + double alpha[] = {1.5, -1.0}; + double norm; + + norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy and conjugate + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(zomatcopy, c_api_rowmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'R'; + double alpha[] = {1.5, -1.0}; + double norm; + + norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zomatcopy, c_api_rowmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'C'; + double alpha[] = {1.0, 2.0}; + double norm; + + norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** +* Test error function for an invalid param order. +* Must be column (C) or row major (R). +*/ +CTEST(zomatcopy, xerbla_invalid_order) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'O'; + char trans = 'T'; + int expected_info = 1; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param trans. +* Must be trans (T/C) or no-trans (N/R). +*/ +CTEST(zomatcopy, xerbla_invalid_trans) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'O'; + int expected_info = 2; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param lda. +* If matrices are stored using row major layout, +* lda must be at least n. +*/ +CTEST(zomatcopy, xerbla_rowmajor_invalid_lda) +{ + blasint m = 50, n = 100; + blasint lda = 50, ldb = 100; + char order = 'R'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param lda. +* If matrices are stored using column major layout, +* lda must be at least m. +*/ +CTEST(zomatcopy, xerbla_colmajor_invalid_lda) +{ + blasint m = 100, n = 50; + blasint lda = 50, ldb = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param ldb. +* If matrices are stored using row major layout and +* there is no transposition, ldb must be at least n. +*/ +CTEST(zomatcopy, xerbla_rowmajor_notrans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param ldb. +* If matrices are stored using row major layout and +* there is transposition, ldb must be at least m. +*/ +CTEST(zomatcopy, xerbla_rowmajor_trans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param ldb. +* If matrices are stored using row major layout and +* there is no transposition, ldb must be at least n. +*/ +CTEST(zomatcopy, xerbla_rowmajor_conj_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'R'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param ldb. +* If matrices are stored using row major layout and +* there is transposition, ldb must be at least m. +*/ +CTEST(zomatcopy, xerbla_rowmajor_transconj_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'C'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param ldb. +* If matrices are stored using column major layout and +* there is no transposition, ldb must be at least m. +*/ +CTEST(zomatcopy, xerbla_colmajor_notrans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param ldb. +* If matrices are stored using column major layout and +* there is transposition, ldb must be at least n. +*/ +CTEST(zomatcopy, xerbla_colmajor_trans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param ldb. +* If matrices are stored using column major layout and +* there is no transposition, ldb must be at least m. +*/ +CTEST(zomatcopy, xerbla_colmajor_conj_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'R'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param ldb. +* If matrices are stored using column major layout and +* there is transposition, ldb must be at least n. +*/ +CTEST(zomatcopy, xerbla_colmajor_transconj_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'C'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif diff --git a/utest/test_extensions/test_zrot.c b/utest/test_extensions/test_zrot.c new file mode 100644 index 000000000..5471e051a --- /dev/null +++ b/utest/test_extensions/test_zrot.c @@ -0,0 +1,790 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_ZROT { + double x_test[DATASIZE * INCREMENT * 2]; + double y_test[DATASIZE * INCREMENT * 2]; + double x_verify[DATASIZE * INCREMENT * 2]; + double y_verify[DATASIZE * INCREMENT * 2]; +}; + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZROT data_zrot; + +/** + * Comapare results computed by zdrot and zaxpby + * + * param n specifies size of vector x + * param inc_x specifies increment of vector x + * param inc_y specifies increment of vector y + * param c specifies cosine + * param s specifies sine + * return norm of differences + */ +static double check_zdrot(blasint n, blasint inc_x, blasint inc_y, double *c, double *s) +{ + blasint i; + double norm = 0; + double s_neg[] = {-s[0], s[1]}; + + blasint inc_x_abs = labs(inc_x); + blasint inc_y_abs = labs(inc_y); + + // Fill vectors x, y + drand_generate(data_zrot.x_test, n * inc_x_abs * 2); + drand_generate(data_zrot.y_test, n * inc_y_abs * 2); + + if (inc_x == 0 && inc_y == 0) { + drand_generate(data_zrot.x_test, n * 2); + drand_generate(data_zrot.y_test, n * 2); + } + + // Copy vector x for zaxpby + for (i = 0; i < n * inc_x_abs * 2; i++) + data_zrot.x_verify[i] = data_zrot.x_test[i]; + + // Copy vector y for zaxpby + for (i = 0; i < n * inc_y_abs * 2; i++) + data_zrot.y_verify[i] = data_zrot.y_test[i]; + + // Find cx = c*x + s*y + BLASFUNC(zaxpby)(&n, s, data_zrot.y_test, &inc_y, c, data_zrot.x_verify, &inc_x); + + // Find cy = -conjg(s)*x + c*y + BLASFUNC(zaxpby)(&n, s_neg, data_zrot.x_test, &inc_x, c, data_zrot.y_verify, &inc_y); + + BLASFUNC(zdrot)(&n, data_zrot.x_test, &inc_x, data_zrot.y_test, &inc_y, c, s); + + // Find the differences between vector x caculated by zaxpby and zdrot + for (i = 0; i < n * 2 * inc_x_abs; i++) + data_zrot.x_test[i] -= data_zrot.x_verify[i]; + + // Find the differences between vector y caculated by zaxpby and zdrot + for (i = 0; i < n * 2 * inc_y_abs; i++) + data_zrot.y_test[i] -= data_zrot.y_verify[i]; + + // Find the norm of differences + norm += BLASFUNC(dznrm2)(&n, data_zrot.x_test, &inc_x_abs); + norm += BLASFUNC(dznrm2)(&n, data_zrot.y_test, &inc_y_abs); + return (norm / 2); +} + +/** + * C API specific function + * Comapare results computed by zdrot and zaxpby + * + * param n specifies size of vector x + * param inc_x specifies increment of vector x + * param inc_y specifies increment of vector y + * param c specifies cosine + * param s specifies sine + * return norm of differences + */ +static double c_api_check_zdrot(blasint n, blasint inc_x, blasint inc_y, double *c, double *s) +{ + blasint i; + double norm = 0; + double s_neg[] = {-s[0], s[1]}; + + blasint inc_x_abs = labs(inc_x); + blasint inc_y_abs = labs(inc_y); + + // Fill vectors x, y + drand_generate(data_zrot.x_test, n * inc_x_abs * 2); + drand_generate(data_zrot.y_test, n * inc_y_abs * 2); + + if (inc_x == 0 && inc_y == 0) { + drand_generate(data_zrot.x_test, n * 2); + drand_generate(data_zrot.y_test, n * 2); + } + + // Copy vector x for zaxpby + for (i = 0; i < n * inc_x_abs * 2; i++) + data_zrot.x_verify[i] = data_zrot.x_test[i]; + + // Copy vector y for zaxpby + for (i = 0; i < n * inc_y_abs * 2; i++) + data_zrot.y_verify[i] = data_zrot.y_test[i]; + + // Find cx = c*x + s*y + cblas_zaxpby(n, s, data_zrot.y_test, inc_y, c, data_zrot.x_verify, inc_x); + + // Find cy = -conjg(s)*x + c*y + cblas_zaxpby(n, s_neg, data_zrot.x_test, inc_x, c, data_zrot.y_verify, inc_y); + + cblas_zdrot(n, data_zrot.x_test, inc_x, data_zrot.y_test, inc_y, c[0], s[0]); + + // Find the differences between vector x caculated by zaxpby and zdrot + for (i = 0; i < n * 2 * inc_x_abs; i++) + data_zrot.x_test[i] -= data_zrot.x_verify[i]; + + // Find the differences between vector y caculated by zaxpby and zdrot + for (i = 0; i < n * 2 * inc_y_abs; i++) + data_zrot.y_test[i] -= data_zrot.y_verify[i]; + + // Find the norm of differences + norm += cblas_dznrm2(n, data_zrot.x_test, inc_x_abs); + norm += cblas_dznrm2(n, data_zrot.y_test, inc_y_abs); + return (norm / 2); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 0 + * Stride of vector y is 0 + * c = 1.0 + * s = 2.0 + */ +CTEST(zrot, inc_x_0_inc_y_0) +{ + blasint n = 100; + + blasint inc_x = 0; + blasint inc_y = 0; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {2.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, inc_x_1_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = 1; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is -1 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, inc_x_neg_1_inc_y_neg_1) +{ + blasint n = 100; + + blasint inc_x = -1; + blasint inc_y = -1; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + * c = 3.0 + * s = 2.0 + */ +CTEST(zrot, inc_x_2_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 1; + + // Imaginary part for zaxpby + double c[] = {3.0, 0.0}; + double s[] = {2.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is 1 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, inc_x_neg_2_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = -2; + blasint inc_y = 1; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, inc_x_1_inc_y_2) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = 2; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is -2 + * c = 2.0 + * s = 1.0 + */ +CTEST(zrot, inc_x_1_inc_y_neg_2) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = -2; + + // Imaginary part for zaxpby + double c[] = {2.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0 + * s = 2.0 + */ +CTEST(zrot, inc_x_2_inc_y_2) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {2.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, inc_x_neg_2_inc_y_neg_2) +{ + blasint n = 100; + + blasint inc_x = -2; + blasint inc_y = -2; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 0.0 + * s = 1.0 + */ +CTEST(zrot, inc_x_2_inc_y_2_c_zero) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for zaxpby + double c[] = {0.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0 + * s = 0.0 + */ +CTEST(zrot, inc_x_2_inc_y_2_s_zero) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {0.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 0 + * Stride of vector x is 1 + * Stride of vector y is 1 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, check_n_zero) +{ + blasint n = 0; + + blasint inc_x = 1; + blasint inc_y = 1; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 0 + * Stride of vector y is 0 + * c = 1.0 + * s = 2.0 + */ +CTEST(zrot, c_api_inc_x_0_inc_y_0) +{ + blasint n = 100; + + blasint inc_x = 0; + blasint inc_y = 0; + + // Imaginary part for zaxpby + double c[] = {3.0, 0.0}; + double s[] = {2.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, c_api_inc_x_1_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = 1; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is -1 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, c_api_inc_x_neg_1_inc_y_neg_1) +{ + blasint n = 100; + + blasint inc_x = -1; + blasint inc_y = -1; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + * c = 3.0 + * s = 2.0 + */ +CTEST(zrot, c_api_inc_x_2_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 1; + + // Imaginary part for zaxpby + double c[] = {3.0, 0.0}; + double s[] = {2.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is 1 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, c_api_inc_x_neg_2_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = -2; + blasint inc_y = 1; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, c_api_inc_x_1_inc_y_2) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = 2; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is -2 + * c = 2.0 + * s = 1.0 + */ +CTEST(zrot, c_api_inc_x_1_inc_y_neg_2) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = -2; + + // Imaginary part for zaxpby + double c[] = {2.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0 + * s = 2.0 + */ +CTEST(zrot, c_api_inc_x_2_inc_y_2) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {2.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, c_api_inc_x_neg_2_inc_y_neg_2) +{ + blasint n = 100; + + blasint inc_x = -2; + blasint inc_y = -2; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 0.0 + * s = 1.0 + */ +CTEST(zrot, c_api_inc_x_2_inc_y_2_c_zero) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for zaxpby + double c[] = {0.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0 + * s = 0.0 + */ +CTEST(zrot, c_api_inc_x_2_inc_y_2_s_zero) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {0.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 0 + * Stride of vector x is 1 + * Stride of vector y is 1 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, c_api_check_n_zero) +{ + blasint n = 0; + + blasint inc_x = 1; + blasint inc_y = 1; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_zrotg.c b/utest/test_extensions/test_zrotg.c new file mode 100644 index 000000000..1de95447d --- /dev/null +++ b/utest/test_extensions/test_zrotg.c @@ -0,0 +1,290 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#ifdef BUILD_COMPLEX16 + +/** + * Fortran API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, zero_a) +{ + double sa[2] = {0.0, 0.0}; + double sb[2] = {1.0, 1.0}; + double ss[2]; + double sc; + BLASFUNC(zrotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.0, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.70710678118655, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.70710678118655, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.41421356237310, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, sa[1], DOUBLE_EPS); +} + +/** + * Fortran API specific tests + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, zero_b) +{ + double sa[2] = {1.0, 1.0}; + double sb[2] = {0.0, 0.0}; + double ss[2]; + double sc; + BLASFUNC(zrotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(1.0, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0, sa[1], DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, zero_real) +{ + double sa[2] = {0.0, 1.0}; + double sb[2] = {0.0, 1.0}; + double ss[2]; + double sc; + BLASFUNC(zrotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.70710678118654, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.70710678118654, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.41421356237309, sa[1], DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, positive_real_positive_img) +{ + double sa[2] = {3.0, 4.0}; + double sb[2] = {4.0, 6.0}; + double ss[2]; + double sc; + BLASFUNC(zrotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.04558423058385, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(5.26497863243527, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(7.01997150991369, sa[1], DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, negative_real_positive_img) +{ + double sa[2] = {-3.0, 4.0}; + double sb[2] = {-4.0, 6.0}; + double ss[2]; + double sc; + BLASFUNC(zrotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.04558423058385, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-5.26497863243527, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(7.01997150991369, sa[1], DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, positive_real_negative_img) +{ + double sa[2] = {3.0, -4.0}; + double sb[2] = {4.0, -6.0}; + double ss[2]; + double sc; + BLASFUNC(zrotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.04558423058385, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(5.26497863243527, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-7.01997150991369, sa[1], DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, negative_real_negative_img) +{ + double sa[2] = {-3.0, -4.0}; + double sb[2] = {-4.0, -6.0}; + double ss[2]; + double sc; + BLASFUNC(zrotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.04558423058385, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-5.26497863243527, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-7.01997150991369, sa[1], DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, c_api_zero_a) +{ + double sa[2] = {0.0, 0.0}; + double sb[2] = {1.0, 1.0}; + double ss[2]; + double sc; + cblas_zrotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.0, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.70710678118655, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.70710678118655, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.41421356237310, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, sa[1], DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, c_api_zero_b) +{ + double sa[2] = {1.0, 1.0}; + double sb[2] = {0.0, 0.0}; + double ss[2]; + double sc; + cblas_zrotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(1.0, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0, sa[1], DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, c_api_zero_real) +{ + double sa[2] = {0.0, 1.0}; + double sb[2] = {0.0, 1.0}; + double ss[2]; + double sc; + cblas_zrotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.70710678118654, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.70710678118654, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.41421356237309, sa[1], DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, c_api_positive_real_positive_img) +{ + double sa[2] = {3.0, 4.0}; + double sb[2] = {4.0, 6.0}; + double ss[2]; + double sc; + cblas_zrotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.04558423058385, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(5.26497863243527, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(7.01997150991369, sa[1], DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, c_api_negative_real_positive_img) +{ + double sa[2] = {-3.0, 4.0}; + double sb[2] = {-4.0, 6.0}; + double ss[2]; + double sc; + cblas_zrotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.04558423058385, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-5.26497863243527, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(7.01997150991369, sa[1], DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, c_api_positive_real_negative_img) +{ + double sa[2] = {3.0, -4.0}; + double sb[2] = {4.0, -6.0}; + double ss[2]; + double sc; + cblas_zrotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.04558423058385, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(5.26497863243527, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-7.01997150991369, sa[1], DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, c_api_negative_real_negative_img) +{ + double sa[2] = {-3.0, -4.0}; + double sb[2] = {-4.0, -6.0}; + double ss[2]; + double sc; + cblas_zrotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.04558423058385, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-5.26497863243527, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-7.01997150991369, sa[1], DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_zsbmv.c b/utest/test_extensions/test_zsbmv.c new file mode 100644 index 000000000..0e79dc0d8 --- /dev/null +++ b/utest/test_extensions/test_zsbmv.c @@ -0,0 +1,607 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_ZSBMV { + double sp_matrix[DATASIZE * (DATASIZE + 1)]; + double sb_matrix[DATASIZE * DATASIZE * 2]; + double b_test[DATASIZE * 2 * INCREMENT]; + double c_test[DATASIZE * 2 * INCREMENT]; + double c_verify[DATASIZE * 2 * INCREMENT]; +}; + +// DOUBLE_EPS_ZGEMV = MAX_VAL * NUMBER OF OPERATIONS * DBL_EPSILON +// DOUBLE_EPS_ZGEMV = 5.0 * O(100 * 100) * 2.2e-16 = 1e-11 +#define DOUBLE_EPS_ZGEMV 1e-11 + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZSBMV data_zsbmv; + +/** + * Transform full-storage symmetric band matrix A to upper (U) or lower (L) + * band-packed storage mode. + * + * param uplo specifies whether matrix a is upper or lower band-packed. + * param n - number of rows and columns of A + * param k - number of super-diagonals of A + * output param a - buffer for holding symmetric band-packed matrix + * param lda - specifies the leading dimension of a + * param sb_matrix - buffer holding full-storage symmetric band matrix A + * param ldm - specifies the leading dimension of A + */ +static void transform_to_band_storage(char uplo, blasint n, blasint k, double* a, blasint lda, + double* sb_matrix, blasint ldm) +{ + blasint i, j, m; + if (uplo == 'L') { + for (j = 0; j < n; j++) + { + m = -j; + for (i = 2 * j; i < MIN(2 * n, 2 * (j + k + 1)); i += 2) + { + a[(2*m + i) + j * lda * 2] = sb_matrix[i + j * ldm * 2]; + a[(2*m + (i + 1)) + j * lda * 2] = sb_matrix[(i + 1) + j * ldm * 2]; + } + } + } + else { + for (j = 0; j < n; j++) + { + m = k - j; + for (i = MAX(0, 2*(j - k)); i <= j*2; i += 2) + { + a[(2*m + i) + j * lda * 2] = sb_matrix[i + j * ldm * 2]; + a[(2*m + (i + 1)) + j * lda * 2] = sb_matrix[(i + 1) + j * ldm * 2]; + } + } + } +} + +/** + * Generate full-storage symmetric band matrix A with k - super-diagonals + * from input symmetric packed matrix in lower packed mode (L) + * + * output param sb_matrix - buffer for holding full-storage symmetric band matrix. + * param sp_matrix - buffer holding input symmetric packed matrix + * param n - number of rows and columns of A + * param k - number of super-diagonals of A + */ +static void get_symmetric_band_matr(double *sb_matrix, double *sp_matrix, blasint n, blasint k) +{ + blasint m; + blasint i, j; + m = 0; + for (i = 0; i < n; i++) + { + for (j = 0; j < n * 2; j += 2) + { + // Make matrix band with k super-diagonals + if (fabs((i+1) - ceil((j+1)/2.0)) > k) + { + sb_matrix[i * n * 2 + j] = 0.0; + sb_matrix[i * n * 2 + j + 1] = 0.0; + continue; + } + + if (j / 2 < i) + { + sb_matrix[i * n * 2 + j] = + sb_matrix[j * n + i * 2]; + sb_matrix[i * n * 2 + j + 1] = + sb_matrix[j * n + i * 2 + 1]; + } + else + { + sb_matrix[i * n * 2 + j] = sp_matrix[m++]; + sb_matrix[i * n * 2 + j + 1] = sp_matrix[m++]; + } + } + } +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param uplo specifies whether matrix a is upper or lower band-packed. + * param n - number of rows and columns of A + * param k - number of super-diagonals of A + * param lda - specifies the leading dimension of a + * param inc_b - stride of vector b_test + * param inc_c - stride of vector c_test + * param expected_info - expected invalid parameter number in zsbmv + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char uplo, blasint n, blasint k, blasint lda, blasint inc_b, + blasint inc_c, int expected_info) +{ + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double a[2]; + drand_generate(a, 2); + + set_xerbla("ZSBMV ", expected_info); + + BLASFUNC(zsbmv)(&uplo, &n, &k, alpha, a, &lda, data_zsbmv.b_test, + &inc_b, beta, data_zsbmv.c_test, &inc_c); + + return check_error(); +} + +/** + * Comapare results computed by zsbmv and zgemv + * since zsbmv is zgemv for symmetric band matrix + * + * param uplo specifies whether matrix A is upper or lower triangular + * param n - number of rows and columns of A + * param k - number of super-diagonals of A + * param alpha - scaling factor for the matrix-vector product + * param lda - specifies the leading dimension of a + * param inc_b - stride of vector b_test + * param beta - scaling factor for vector c_test + * param inc_c - stride of vector c_test + * param lda - specifies the leading dimension of a + * return norm of differences + */ +static double check_zsbmv(char uplo, blasint n, blasint k, double *alpha, blasint lda, + blasint inc_b, double *beta, blasint inc_c, blasint ldm) +{ + blasint i; + + // Trans param for gemv (can use any, since the input matrix is symmetric) + char trans = 'N'; + + // Symmetric band packed matrix for sbmv + double *a = (double*) malloc(lda * n * 2 * sizeof(double)); + + // Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test + drand_generate(data_zsbmv.sp_matrix, n * (n + 1)); + drand_generate(data_zsbmv.b_test, n * inc_b * 2); + drand_generate(data_zsbmv.c_test, n * inc_c * 2); + + // Copy vector c_test for zgemv + for (i = 0; i < n * inc_c * 2; i++) + data_zsbmv.c_verify[i] = data_zsbmv.c_test[i]; + + // Generate full-storage symmetric band matrix + // with k super-diagonals from symmetric packed matrix + get_symmetric_band_matr(data_zsbmv.sb_matrix, data_zsbmv.sp_matrix, n, k); + + // Transform symmetric band matrix from conventional + // full matrix storage to band storage for zsbmv + transform_to_band_storage(uplo, n, k, a, lda, data_zsbmv.sb_matrix, ldm); + + BLASFUNC(zgemv)(&trans, &n, &n, alpha, data_zsbmv.sb_matrix, &ldm, data_zsbmv.b_test, + &inc_b, beta, data_zsbmv.c_verify, &inc_c); + + BLASFUNC(zsbmv)(&uplo, &n, &k, alpha, a, &lda, + data_zsbmv.b_test, &inc_b, beta, data_zsbmv.c_test, &inc_c); + + free(a); + // Find the differences between output vector caculated by zsbmv and zgemv + for (i = 0; i < n * inc_c * 2; i++) + data_zsbmv.c_test[i] -= data_zsbmv.c_verify[i]; + + // Find the norm of differences + return BLASFUNC(dznrm2)(&n, data_zsbmv.c_test, &inc_c); +} + +/** + * Test zsbmv by comparing it against zgemv + * with the following options: + * + * a is upper-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 0 + */ +CTEST(zsbmv, upper_k_0_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 0; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Test zsbmv by comparing it against zgemv + * with the following options: + * + * a is upper-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 1 + */ +CTEST(zsbmv, upper_k_1_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 1; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Test zsbmv by comparing it against zgemv + * with the following options: + * + * a is upper-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 2 + */ +CTEST(zsbmv, upper_k_2_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Test zsbmv by comparing it against zgemv + * with the following options: + * + * a is upper-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 2 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 2 + */ +CTEST(zsbmv, upper_k_2_inc_b_2_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 2, inc_c = 1; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + double alpha[] = {2.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Test zsbmv by comparing it against zgemv + * with the following options: + * + * a is upper-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 2 + * Stride of vector c_test is 2 + * Number of super-diagonals k is 2 + */ +CTEST(zsbmv, upper_k_2_inc_b_2_inc_c_2_n_100) +{ + blasint n = DATASIZE, inc_b = 2, inc_c = 2; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + double alpha[] = {2.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Test zsbmv by comparing it against zgemv + * with the following options: + * + * a is lower-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 0 + */ +CTEST(zsbmv, lower_k_0_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 0; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'L'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Test zsbmv by comparing it against zgemv + * with the following options: + * + * a is lower-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 1 + */ +CTEST(zsbmv, lower_k_1_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 1; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'L'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Test zsbmv by comparing it against zgemv + * with the following options: + * + * a is lower-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 2 + */ +CTEST(zsbmv, lower_k_2_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'L'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Test zsbmv by comparing it against zgemv + * with the following options: + * + * a is lower-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 2 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 2 + */ +CTEST(zsbmv, lower_k_2_inc_b_2_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 2, inc_c = 1; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'L'; + + double alpha[] = {2.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Test zsbmv by comparing it against zgemv + * with the following options: + * + * a is lower-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 2 + * Stride of vector c_test is 2 + * Number of super-diagonals k is 2 + */ +CTEST(zsbmv, lower_k_2_inc_b_2_inc_c_2_n_100) +{ + blasint n = DATASIZE, inc_b = 2, inc_c = 2; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'L'; + + double alpha[] = {2.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Check if output matrix a contains any NaNs + */ +CTEST(zsbmv, check_for_NaN) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 0; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + + ASSERT_TRUE(norm == norm); /* NaN == NaN is false */ +} + +/** + * Test error function for an invalid param uplo. + * Uplo specifies whether a is in upper (U) or lower (L) band-packed storage mode. + */ +CTEST(zsbmv, xerbla_uplo_invalid) +{ + blasint n = 1, inc_b = 1, inc_c = 1; + char uplo = 'O'; + blasint k = 0; + blasint lda = k + 1; + int expected_info = 1; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param N - + * number of rows and columns of A. Must be at least zero. + */ +CTEST(zsbmv, xerbla_n_invalid) +{ + blasint n = INVALID, inc_b = 1, inc_c = 1; + char uplo = 'U'; + blasint k = 0; + blasint lda = k + 1; + int expected_info = 2; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Check if n - number of rows and columns of A equal zero. + */ +CTEST(zsbmv, check_n_zero) +{ + blasint n = 0, inc_b = 1, inc_c = 1; + blasint k = 0; + blasint lda = k + 1; + blasint ldm = 1; + char uplo = 'U'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Test error function for an invalid param inc_b - + * stride of vector b_test. Can't be zero. + */ +CTEST(zsbmv, xerbla_inc_b_zero) +{ + blasint n = 1, inc_b = 0, inc_c = 1; + char uplo = 'U'; + blasint k = 0; + blasint lda = k + 1; + int expected_info = 8; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param inc_c - + * stride of vector c_test. Can't be zero. + */ +CTEST(zsbmv, xerbla_inc_c_zero) +{ + blasint n = 1, inc_b = 1, inc_c = 0; + char uplo = 'U'; + blasint k = 0; + blasint lda = k + 1; + int expected_info = 11; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param k - + * number of super-diagonals of A. Must be at least zero. + */ +CTEST(zsbmv, xerbla_k_invalid) +{ + blasint n = 1, inc_b = 1, inc_c = 1; + char uplo = 'U'; + blasint k = INVALID; + blasint lda = 1; + int expected_info = 3; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda - + * specifies the leading dimension of a. Must be at least (k+1). + */ +CTEST(zsbmv, xerbla_lda_invalid) +{ + blasint n = 1, inc_b = 1, inc_c = 1; + char uplo = 'U'; + blasint k = 0; + blasint lda = INVALID; + int expected_info = 6; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif diff --git a/utest/test_extensions/test_zscal.c b/utest/test_extensions/test_zscal.c new file mode 100644 index 000000000..132f4ee5b --- /dev/null +++ b/utest/test_extensions/test_zscal.c @@ -0,0 +1,165 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_ZSCAL { + double x_test[DATASIZE * 2 * INCREMENT]; + double x_verify[DATASIZE * 2 * INCREMENT]; +}; + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZSCAL data_zscal; + + +/** + * zscal reference code + * + * param n - number of elements of vector x + * param alpha - scaling factor for the vector product + * param x - buffer holding input vector x + * param inc - stride of vector x + */ +static void zscal_trusted(blasint n, double *alpha, double* x, blasint inc){ + blasint i, ip = 0; + blasint inc_x2 = 2 * inc; + double temp; + for (i = 0; i < n; i++) + { + temp = alpha[0] * x[ip] - alpha[1] * x[ip+1]; + x[ip+1] = alpha[0] * x[ip+1] + alpha[1] * x[ip]; + x[ip] = temp; + ip += inc_x2; + } +} + +/** + * Comapare results computed by zscal and zscal_trusted + * + * param api specifies tested api (C or Fortran) + * param n - number of elements of vector x + * param alpha - scaling factor for the vector product + * param inc - stride of vector x + * return norm of differences + */ +static double check_zscal(char api, blasint n, double *alpha, blasint inc) +{ + blasint i; + + // Fill vectors x + drand_generate(data_zscal.x_test, n * inc * 2); + + // Copy vector x for zscal_trusted + for (i = 0; i < n * 2 * inc; i++) + data_zscal.x_verify[i] = data_zscal.x_test[i]; + + zscal_trusted(n, alpha, data_zscal.x_verify, inc); + + if(api == 'F') + BLASFUNC(zscal)(&n, alpha, data_zscal.x_test, &inc); + else + cblas_zscal(n, alpha, data_zscal.x_test, inc); + + // Find the differences between output vector computed by zscal and zscal_trusted + for (i = 0; i < n * 2 * inc; i++) + data_zscal.x_verify[i] -= data_zscal.x_test[i]; + + // Find the norm of differences + return BLASFUNC(dznrm2)(&n, data_zscal.x_verify, &inc); +} + +/** + * Fortran API specific test + * Test zscal by comparing it against reference + */ +CTEST(zscal, alpha_r_zero_alpha_i_not_zero) +{ + blasint N = DATASIZE; + blasint inc = 1; + double alpha[2] = {0.0, 1.0}; + + double norm = check_zscal('F', N, alpha, inc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zscal by comparing it against reference + */ +CTEST(zscal, alpha_r_zero_alpha_i_zero_inc_2) +{ + blasint N = DATASIZE; + blasint inc = 2; + double alpha[2] = {0.0, 0.0}; + + double norm = check_zscal('F', N, alpha, inc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zscal by comparing it against reference + */ +CTEST(zscal, c_api_alpha_r_zero_alpha_i_not_zero) +{ + blasint N = DATASIZE; + blasint inc = 1; + double alpha[2] = {0.0, 1.0}; + + double norm = check_zscal('C', N, alpha, inc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zscal by comparing it against reference + */ +CTEST(zscal, c_api_alpha_r_zero_alpha_i_zero_inc_2) +{ + blasint N = DATASIZE; + blasint inc = 2; + double alpha[2] = {0.0, 0.0}; + + double norm = check_zscal('C', N, alpha, inc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_zspmv.c b/utest/test_extensions/test_zspmv.c new file mode 100644 index 000000000..510ac0579 --- /dev/null +++ b/utest/test_extensions/test_zspmv.c @@ -0,0 +1,427 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_ZSPMV { + double a_verify[DATASIZE * DATASIZE * 2]; + double a_test[DATASIZE * (DATASIZE + 1)]; + double b_test[DATASIZE * 2 * INCREMENT]; + double c_test[DATASIZE * 2 * INCREMENT]; + double c_verify[DATASIZE * 2 * INCREMENT]; +}; + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZSPMV data_zspmv; + +/** + * Compute spmv via gemv since spmv is gemv for symmetric packed matrix + * + * param uplo specifies whether matrix A is upper or lower triangular + * param n - number of rows and columns of A + * param alpha - scaling factor for the matrix-vector product + * param a - buffer holding input matrix A + * param b - Buffer holding input vector b + * param inc_b - stride of vector b + * param beta - scaling factor for vector c + * param c - buffer holding input/output vector c + * param inc_c - stride of vector c + * output param data_zspmv.c_verify - matrix computed by gemv + */ +static void zspmv_trusted(char uplo, blasint n, double *alpha, double *a, + double *b, blasint inc_b, double *beta, double *c, + blasint inc_c) +{ + blasint k; + blasint i, j; + + // param for gemv (can use any, since the input matrix is symmetric) + char trans = 'N'; + + // Unpack the input symmetric packed matrix + if (uplo == 'L') + { + k = 0; + for (i = 0; i < n; i++) + { + for (j = 0; j < n * 2; j += 2) + { + if (j / 2 < i) + { + data_zspmv.a_verify[i * n * 2 + j] = + data_zspmv.a_verify[j * n + i * 2]; + data_zspmv.a_verify[i * n * 2 + j + 1] = + data_zspmv.a_verify[j * n + i * 2 + 1]; + } + else + { + data_zspmv.a_verify[i * n * 2 + j] = a[k++]; + data_zspmv.a_verify[i * n * 2 + j + 1] = a[k++]; + } + } + } + } + else + { + k = n * (n + 1) - 1; + for (j = 2 * n - 1; j >= 0; j -= 2) + { + for (i = n - 1; i >= 0; i--) + { + if (j / 2 < i) + { + data_zspmv.a_verify[i * n * 2 + j] = + data_zspmv.a_verify[(j - 1) * n + i * 2 + 1]; + data_zspmv.a_verify[i * n * 2 + j - 1] = + data_zspmv.a_verify[(j - 1) * n + i * 2]; + } + else + { + data_zspmv.a_verify[i * n * 2 + j] = a[k--]; + data_zspmv.a_verify[i * n * 2 + j - 1] = a[k--]; + } + } + } + } + + // Run gemv with unpacked matrix + BLASFUNC(zgemv)(&trans, &n, &n, alpha, data_zspmv.a_verify, &n, b, + &inc_b, beta, c, &inc_c); +} + +/** + * Comapare results computed by zspmv and zspmv_trusted + * + * param uplo specifies whether matrix A is upper or lower triangular + * param n - number of rows and columns of A + * param alpha - scaling factor for the matrix-vector product + * param inc_b - stride of vector b + * param beta - scaling factor for vector c + * param inc_c - stride of vector c + * return norm of differences + */ +static double check_zspmv(char uplo, blasint n, double *alpha, blasint inc_b, + double *beta, blasint inc_c) +{ + blasint i; + + // Fill symmetric packed maxtix a, vectors b and c + drand_generate(data_zspmv.a_test, n * (n + 1)); + drand_generate(data_zspmv.b_test, 2 * n * inc_b); + drand_generate(data_zspmv.c_test, 2 * n * inc_c); + + // Copy vector c for zspmv_trusted + for (i = 0; i < n * 2 * inc_c; i++) + data_zspmv.c_verify[i] = data_zspmv.c_test[i]; + + zspmv_trusted(uplo, n, alpha, data_zspmv.a_test, data_zspmv.b_test, + inc_b, beta, data_zspmv.c_verify, inc_c); + BLASFUNC(zspmv)(&uplo, &n, alpha, data_zspmv.a_test, data_zspmv.b_test, + &inc_b, beta, data_zspmv.c_test, &inc_c); + + // Find the differences between output vector caculated by zspmv and zspmv_trusted + for (i = 0; i < n * 2 * inc_c; i++) + data_zspmv.c_test[i] -= data_zspmv.c_verify[i]; + + // Find the norm of differences + return BLASFUNC(dznrm2)(&n, data_zspmv.c_test, &inc_c); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param uplo specifies whether matrix A is upper or lower triangular + * param n - number of rows and columns of A + * param inc_b - stride of vector b + * param inc_c - stride of vector c + * param expected_info - expected invalid parameter number in zspmv + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char uplo, blasint n, blasint inc_b, + blasint inc_c, int expected_info) +{ + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + set_xerbla("ZSPMV ", expected_info); + + BLASFUNC(zspmv)(&uplo, &n, alpha, data_zspmv.a_test, data_zspmv.b_test, + &inc_b, beta, data_zspmv.c_test, &inc_c); + + return check_error(); +} + +/** + * Test zspmv by comparing it against zgemv + * with the following options: + * + * A is upper triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(zspmv, upper_inc_b_1_inc_c_1_N_100) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 1; + char uplo = 'U'; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zspmv by comparing it against zgemv + * with the following options: + * + * A is upper triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 2 + */ +CTEST(zspmv, upper_inc_b_1_inc_c_2_N_100) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 2; + char uplo = 'U'; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zspmv by comparing it against zgemv + * with the following options: + * + * A is upper triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 1 + */ +CTEST(zspmv, upper_inc_b_2_inc_c_1_N_100) +{ + blasint N = DATASIZE, inc_b = 2, inc_c = 1; + char uplo = 'U'; + double alpha[] = {1.0, 0.0}; + double beta[] = {1.0, 0.0}; + + double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zspmv by comparing it against zgemv + * with the following options: + * + * A is upper triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 2 + */ +CTEST(zspmv, upper_inc_b_2_inc_c_2_N_100) +{ + blasint N = DATASIZE, inc_b = 2, inc_c = 2; + char uplo = 'U'; + double alpha[] = {2.5, -2.1}; + double beta[] = {0.0, 1.0}; + + double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zspmv by comparing it against zgemv + * with the following options: + * + * A is lower triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(zspmv, lower_inc_b_1_inc_c_1_N_100) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 1; + char uplo = 'L'; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zspmv by comparing it against zgemv + * with the following options: + * + * A is lower triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 2 + */ +CTEST(zspmv, lower_inc_b_1_inc_c_2_N_100) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 2; + char uplo = 'L'; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zspmv by comparing it against zgemv + * with the following options: + * + * A is lower triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 1 + */ +CTEST(zspmv, lower_inc_b_2_inc_c_1_N_100) +{ + blasint N = DATASIZE, inc_b = 2, inc_c = 1; + char uplo = 'L'; + double alpha[] = {1.0, 0.0}; + double beta[] = {1.0, 0.0}; + + double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zspmv by comparing it against zgemv + * with the following options: + * + * A is lower triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 2 + */ +CTEST(zspmv, lower_inc_b_2_inc_c_2_N_100) +{ + blasint N = DATASIZE, inc_b = 2, inc_c = 2; + char uplo = 'L'; + double alpha[] = {2.5, -2.1}; + double beta[] = {0.0, 1.0}; + + double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Check if output matrix A contains any NaNs + */ +CTEST(zspmv, check_for_NaN) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 1; + char uplo = 'U'; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_TRUE(norm == norm); /* NaN == NaN is false */ +} + +/** + * Test error function for an invalid param uplo. + * uplo specifies whether A is upper or lower triangular. + */ +CTEST(zspmv, xerbla_uplo_invalid) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 1; + char uplo = 'O'; + int expected_info = 1; + + int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param N - + * number of rows and columns of A. Must be at least zero. + */ +CTEST(zspmv, xerbla_N_invalid) +{ + blasint N = INVALID, inc_b = 1, inc_c = 1; + char uplo = 'U'; + int expected_info = 2; + + int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param inc_b - + * stride of vector b. Can't be zero. + */ +CTEST(zspmv, xerbla_inc_b_zero) +{ + blasint N = DATASIZE, inc_b = 0, inc_c = 1; + char uplo = 'U'; + int expected_info = 6; + + int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param inc_c - + * stride of vector c. Can't be zero. + */ +CTEST(zspmv, xerbla_inc_c_zero) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 0; + char uplo = 'U'; + int expected_info = 9; + + int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif diff --git a/utest/test_extensions/test_ztrmv.c b/utest/test_extensions/test_ztrmv.c new file mode 100644 index 000000000..5668ec296 --- /dev/null +++ b/utest/test_extensions/test_ztrmv.c @@ -0,0 +1,266 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include +#include "common.h" + +#define DATASIZE 300 +#define INCREMENT 2 + +struct DATA_ZTRMV { + double a_test[DATASIZE * DATASIZE * 2]; + double a_verify[DATASIZE * DATASIZE * 2]; + double x_test[DATASIZE * INCREMENT * 2]; + double x_verify[DATASIZE * INCREMENT * 2]; +}; + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZTRMV data_ztrmv; + +/** + * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrmv. + * + * param uplo specifies whether A is upper or lower triangular + * param trans specifies op(A), the transposition (conjugation) operation applied to A + * param diag specifies whether the matrix A is unit triangular or not. + * param n - numbers of rows and columns of A + * param lda - leading dimension of matrix A + * param incx - increment for the elements of x + * return norm of difference + */ +static double check_ztrmv(char uplo, char trans, char diag, blasint n, blasint lda, blasint incx) +{ + blasint i; + double alpha_conj[] = {1.0, 0.0}; + char trans_verify = trans; + + drand_generate(data_ztrmv.a_test, n * lda * 2); + drand_generate(data_ztrmv.x_test, n * incx * 2); + + for (i = 0; i < n * lda * 2; i++) + data_ztrmv.a_verify[i] = data_ztrmv.a_test[i]; + + for (i = 0; i < n * incx * 2; i++) + data_ztrmv.x_verify[i] = data_ztrmv.x_test[i]; + + if (trans == 'R'){ + cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, n, n, alpha_conj, data_ztrmv.a_verify, lda, lda); + trans_verify = 'N'; + } + + BLASFUNC(ztrmv)(&uplo, &trans_verify, &diag, &n, data_ztrmv.a_verify, &lda, + data_ztrmv.x_verify, &incx); + + BLASFUNC(ztrmv)(&uplo, &trans, &diag, &n, data_ztrmv.a_test, &lda, + data_ztrmv.x_test, &incx); + + for (i = 0; i < n * incx * 2; i++) + data_ztrmv.x_verify[i] -= data_ztrmv.x_test[i]; + + return BLASFUNC(dznrm2)(&n, data_ztrmv.x_verify, &incx); +} + +/** + * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is not unit triangular + */ +CTEST(ztrmv, conj_notrans_upper_not_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'U'; + char diag = 'N'; + char trans = 'R'; + + double norm = check_ztrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is unit triangular + */ +CTEST(ztrmv, conj_notrans_upper_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'U'; + char diag = 'U'; + char trans = 'R'; + + double norm = check_ztrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is not unit triangular + */ +CTEST(ztrmv, conj_notrans_lower_not_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'L'; + char diag = 'N'; + char trans = 'R'; + + double norm = check_ztrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is unit triangular + */ +CTEST(ztrmv, conj_notrans_lower_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'L'; + char diag = 'U'; + char trans = 'R'; + + double norm = check_ztrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is not unit triangular + * vector x stride is 2 + */ +CTEST(ztrmv, conj_notrans_upper_not_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'U'; + char diag = 'N'; + char trans = 'R'; + + double norm = check_ztrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is unit triangular + * vector x stride is 2 + */ +CTEST(ztrmv, conj_notrans_upper_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'U'; + char diag = 'U'; + char trans = 'R'; + + double norm = check_ztrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is not unit triangular + * vector x stride is 2 + */ +CTEST(ztrmv, conj_notrans_lower_not_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'L'; + char diag = 'N'; + char trans = 'R'; + + double norm = check_ztrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is unit triangular + * vector x stride is 2 + */ +CTEST(ztrmv, conj_notrans_lower_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'L'; + char diag = 'U'; + char trans = 'R'; + + double norm = check_ztrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_ztrsv.c b/utest/test_extensions/test_ztrsv.c new file mode 100644 index 000000000..4b7ec6aaf --- /dev/null +++ b/utest/test_extensions/test_ztrsv.c @@ -0,0 +1,267 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include +#include "common.h" + +#define DATASIZE 300 +#define INCREMENT 2 + +struct DATA_ZTRSV { + double a_test[DATASIZE * DATASIZE * 2]; + double a_verify[DATASIZE * DATASIZE * 2]; + double x_test[DATASIZE * INCREMENT * 2]; + double x_verify[DATASIZE * INCREMENT * 2]; +}; + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZTRSV data_ztrsv; + +/** + * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrsv. + * + * param uplo specifies whether A is upper or lower triangular + * param trans specifies op(A), the transposition (conjugation) operation applied to A + * param diag specifies whether the matrix A is unit triangular or not. + * param n - numbers of rows and columns of A + * param lda - leading dimension of matrix A + * param incx - increment for the elements of x + * return norm of difference + */ +static double check_ztrsv(char uplo, char trans, char diag, blasint n, blasint lda, blasint incx) +{ + blasint i; + double alpha_conj[] = {1.0, 0.0}; + char trans_verify = trans; + + drand_generate(data_ztrsv.a_test, n * lda * 2); + drand_generate(data_ztrsv.x_test, n * incx * 2); + + for (i = 0; i < n * lda * 2; i++) + data_ztrsv.a_verify[i] = data_ztrsv.a_test[i]; + + for (i = 0; i < n * incx * 2; i++) + data_ztrsv.x_verify[i] = data_ztrsv.x_test[i]; + + if (trans == 'R'){ + cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, n, n, + alpha_conj, data_ztrsv.a_verify, lda, lda); + trans_verify = 'N'; + } + + BLASFUNC(ztrsv)(&uplo, &trans_verify, &diag, &n, data_ztrsv.a_verify, + &lda, data_ztrsv.x_verify, &incx); + + BLASFUNC(ztrsv)(&uplo, &trans, &diag, &n, data_ztrsv.a_test, &lda, + data_ztrsv.x_test, &incx); + + for (i = 0; i < n * incx * 2; i++) + data_ztrsv.x_verify[i] -= data_ztrsv.x_test[i]; + + return BLASFUNC(dznrm2)(&n, data_ztrsv.x_verify, &incx); +} + +/** + * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is not unit triangular + */ +CTEST(ztrsv, conj_notrans_upper_not_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'U'; + char diag = 'N'; + char trans = 'R'; + + double norm = check_ztrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is unit triangular + */ +CTEST(ztrsv, conj_notrans_upper_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'U'; + char diag = 'U'; + char trans = 'R'; + + double norm = check_ztrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is not unit triangular + */ +CTEST(ztrsv, conj_notrans_lower_not_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'L'; + char diag = 'N'; + char trans = 'R'; + + double norm = check_ztrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is unit triangular + */ +CTEST(ztrsv, conj_notrans_lower_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'L'; + char diag = 'U'; + char trans = 'R'; + + double norm = check_ztrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is not unit triangular + * vector x stride is 2 + */ +CTEST(ztrsv, conj_notrans_upper_not_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'U'; + char diag = 'N'; + char trans = 'R'; + + double norm = check_ztrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is unit triangular + * vector x stride is 2 + */ +CTEST(ztrsv, conj_notrans_upper_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'U'; + char diag = 'U'; + char trans = 'R'; + + double norm = check_ztrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is not unit triangular + * vector x stride is 2 + */ +CTEST(ztrsv, conj_notrans_lower_not_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'L'; + char diag = 'N'; + char trans = 'R'; + + double norm = check_ztrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is unit triangular + * vector x stride is 2 + */ +CTEST(ztrsv, conj_notrans_lower_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'L'; + char diag = 'U'; + char trans = 'R'; + + double norm = check_ztrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/xerbla.c b/utest/test_extensions/xerbla.c new file mode 100644 index 000000000..9487b20a6 --- /dev/null +++ b/utest/test_extensions/xerbla.c @@ -0,0 +1,88 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "common.h" + +static int link_xerbla=TRUE; +static int lerr, _info, ok; +static char *rout; + +static void F77_xerbla(char *srname, void *vinfo) +{ + int info=*(int*)vinfo; + + if (link_xerbla) + { + link_xerbla = 0; + return; + } + + if (rout != NULL && strcmp(rout, srname) != 0){ + printf("***** XERBLA WAS CALLED WITH SRNAME = <%s> INSTEAD OF <%s> *******\n", srname, rout); + ok = FALSE; + } + + if (info != _info){ + printf("***** XERBLA WAS CALLED WITH INFO = %d INSTEAD OF %d in %s *******\n",info, _info, srname); + lerr = TRUE; + ok = FALSE; + } else lerr = FALSE; +} + +/** +* error function redefinition +*/ +int BLASFUNC(xerbla)(char *name, blasint *info, blasint length) +{ + F77_xerbla(name, info); + return 0; +} + +int check_error(void) { + if (lerr == TRUE ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", _info, rout); + ok = FALSE; + } + lerr = TRUE; + return ok; +} + +void set_xerbla(char* current_rout, int expected_info){ + if (link_xerbla) /* call these first to link */ + F77_xerbla(rout, &_info); + + ok = TRUE; + lerr = TRUE; + _info = expected_info; + rout = current_rout; +} \ No newline at end of file diff --git a/utest/test_zscal.c b/utest/test_zscal.c new file mode 100644 index 000000000..ffc851e8b --- /dev/null +++ b/utest/test_zscal.c @@ -0,0 +1,104 @@ +#include "openblas_utest.h" +#include +#ifdef BUILD_COMPLEX16 + +#ifndef NAN +#define NAN 0.0/0.0 +#endif +#ifndef INFINITY +#define INFINITY 1.0/0.0 +#endif + +CTEST(zscal, i_nan) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, i, &nan, 1); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); +} + +CTEST(zscal, i_nan_inc_2) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, + NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, i, &nan, 2); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); +} + +CTEST(zscal, nan_i) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, &nan, &i, 1); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isnan(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isnan(i[17])); +} + +CTEST(zscal, nan_i_inc_2) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, + 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, &nan, &i, 2); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isnan(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isnan(i[17])); +} + +CTEST(zscal, i_inf) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, i, &inf, 1); + ASSERT_TRUE(isnan(inf[0])); + ASSERT_TRUE(isinf(inf[1])); + ASSERT_TRUE(isnan(inf[16])); + ASSERT_TRUE(isinf(inf[17])); +} + +CTEST(zscal, i_inf_inc_2) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, + INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, i, &inf, 2); + ASSERT_TRUE(isnan(inf[0])); + ASSERT_TRUE(isinf(inf[1])); + ASSERT_TRUE(isnan(inf[16])); + ASSERT_TRUE(isinf(inf[17])); +} + +CTEST(zscal, inf_i) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, &inf, &i, 1); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isinf(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isinf(i[17])); +} + +CTEST(zscal, inf_i_inc_2) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, + 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, &inf, &i, 2); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isinf(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isinf(i[17])); +} + +#endif diff --git a/utest/utest_main2.c b/utest/utest_main2.c index 4382bf159..46a3b067d 100644 --- a/utest/utest_main2.c +++ b/utest/utest_main2.c @@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CTEST_SEGFAULT #define CTEST_ADD_TESTS_MANUALLY +#include "cblas.h" #include "openblas_utest.h" CTEST(amax, samax){ @@ -617,6 +618,51 @@ CTEST(max, smax_zero){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); } + +CTEST(zscal, i_nan) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, i, &nan, 1); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); +} + +CTEST(zscal, nan_i) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, &nan, &i, 1); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isnan(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isnan(i[17])); + } + +CTEST(zscal, i_inf) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, i, &inf, 1); + ASSERT_TRUE(isnan(inf[0])); + ASSERT_TRUE(isinf(inf[1])); + ASSERT_TRUE(isnan(inf[16])); + ASSERT_TRUE(isinf(inf[17])); +} + +CTEST(zscal, inf_i) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, &inf, &i, 1); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isinf(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isinf(i[17])); +} + int main(int argc, const char ** argv){ CTEST_ADD (amax, samax); @@ -648,7 +694,10 @@ int main(int argc, const char ** argv){ CTEST_ADD (swap,zswap_inc_0); CTEST_ADD (swap,sswap_inc_0); CTEST_ADD (swap,cswap_inc_0); - + CTEST_ADD (zscal, i_nan); + CTEST_ADD (zscal, nan_i); + CTEST_ADD (zscal, i_inf); + CTEST_ADD (zscal, inf_i); int num_fail=0; num_fail=ctest_main(argc, argv);